In [None]:
# Print the packages (and their versions) installed in the current environment.
!pip list

In [None]:
# Install a specific version of numpy
!pip install numpy==1.24.4

# Install a specific version of numba
!pip install numba==0.57.1

# Reinstall scanpy and squidpy
!pip uninstall -y scanpy squidpy
!pip install scanpy squidpy

In [None]:
# Show the numpy version that is currently installed.
!pip show numpy

In [None]:
# Install numpy 1.24.0.
# NOTE(review): an earlier cell pins numpy==1.24.4; running this cell replaces
# that version with 1.24.0 — confirm which pin is the intended one.
!pip install numpy==1.24.0

In [None]:
import scanpy as sc
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the combined AnnData object; the following cells filter it by
# obs['ref'], obs['generalCellTypes'] and obs['tissue'].
adata = sc.read_h5ad("/data/vasileiosionat2/Xenium/tabula_sapiens_epithelial/combined.h5ad")

In [None]:
# Inspect the per-cell metadata of the combined object.
adata.obs

In [None]:
# Keep only cells whose obs['ref'] is 'Williams 2021'; .copy() detaches the
# subset from `adata`.
adata_Wil = adata[adata.obs['ref'] == 'Williams 2021'].copy()

In [None]:
# Inspect the per-cell metadata of the Williams 2021 subset.
adata_Wil.obs

In [None]:
# Restrict the Williams 2021 cells to those labelled 'Epithelial' in
# obs['generalCellTypes'].
adata_Wil_Epi = adata_Wil[adata_Wil.obs['generalCellTypes'] == 'Epithelial'].copy()

In [None]:
# Restrict the epithelial cells to those whose obs['tissue'] is 'gingiva'.
adata_Wil_GEpi = adata_Wil_Epi[adata_Wil_Epi.obs['tissue'] == 'gingiva'].copy()

In [None]:
# Inspect the per-cell metadata of the gingival epithelial subset.
adata_Wil_GEpi.obs

In [None]:
# List the distinct obs['orig.ident'] values present in the gingival
# epithelial subset.
adata_Wil_GEpi.obs['orig.ident'].unique().tolist()

In [None]:
# Load the processed Xenium AnnData object used as the annotation reference.
ref_Xen = sc.read_h5ad("/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad")

In [None]:
# Keep the Xenium cells whose obs['Lvl3'] annotation is one of the four
# classes listed; these labels are what gets transferred further below.
ref_Xen_Epi = ref_Xen[ref_Xen.obs['Lvl3'].isin(["Crev", "Basal", "Spinous", "Keratin"])].copy()


In [None]:
# Inspect the per-cell metadata of the Xenium subset.
ref_Xen_Epi.obs

In [None]:
# Tag every scRNA-seq cell with its assay of origin.
assay_data = pd.Series('scSeq', index=adata_Wil_GEpi.obs.index)
adata_Wil_GEpi.obs['assay'] = assay_data

# Stack the scRNA-seq cells on top of the Xenium cells. `batch_key="assay"`
# writes the origin of each row ("scSeq" for the first object, "xenium" for the
# second) into merged.obs['assay'].
merged = adata_Wil_GEpi.concatenate(
    ref_Xen_Epi,
    batch_key="assay",
    batch_categories=["scSeq", "xenium"],
)

In [None]:
# Standardise every gene of the merged matrix in place, then compute PCA on it.
# NOTE(review): with scanpy's default zero-centring, merged.X holds z-scores
# (including negative values) after this call; anything derived from `merged`
# later inherits the scaled matrix.
sc.pp.scale(merged)
sc.tl.pca(merged)

In [None]:
# Harmony integration across the two assays (key="assay"); the corrected
# embedding is read back below as obsm['X_pca_harmony'].
# This step takes time; ~10 min per iteration
sc.external.pp.harmony_integrate(merged, key="assay", max_iter_harmony=20, max_iter_kmeans=30)

In [None]:
# Build the neighbour graph on the Harmony-corrected embedding (50 neighbours,
# correlation distance) and compute a UMAP from it; takes a long time.
sc.pp.neighbors(merged, n_neighbors=50, use_rep="X_pca_harmony", metric="correlation")
sc.tl.umap(merged, min_dist=0.5)

In [None]:
# Inspect the per-cell metadata of the merged object.
merged.obs

In [None]:
# Colour the UMAP by assay to see how the scRNA-seq and Xenium cells overlap.
sc.pl.umap(merged, color='assay')

In [None]:
# Transfer annotations from xenium to scRNAseq: fit a 1-nearest-neighbour
# classifier on the Harmony embedding of the Xenium cells (labels = Lvl3) and
# predict a label for every scRNA-seq cell.
is_xenium = merged.obs["assay"] == "xenium"
is_scseq = merged.obs["assay"] == "scSeq"

train = merged[is_xenium]
query = merged[is_scseq]

nn = KNeighborsClassifier(n_neighbors=1, n_jobs=16, weights='distance', metric='euclidean')
nn.fit(train.obsm["X_pca_harmony"], train.obs['Lvl3'])
labels = nn.predict(query.obsm["X_pca_harmony"])

# The Series is indexed by the scRNA-seq cells only, so Xenium rows get NaN
# in the new column.
merged.obs["xenium_to_sc_label"] = pd.Series(labels, index=query.obs.index)

In [None]:
# Independent copy of the scRNA-seq rows of `merged`.
merged_sc = merged[merged.obs['assay'] == 'scSeq'].copy()

In [None]:
# Compare the original scRNA-seq cluster labels with the Xenium labels that
# were transferred onto the same cells.
# .copy() turns the subset into a real object: assigning to .obs of an AnnData
# view would otherwise trigger an implicit copy (and a warning).
xen_obj = merged[merged.obs['assay']=='scSeq'].copy()
xen_obj.obs['clusterCellTypes'] = xen_obj.obs['clusterCellTypes'].astype('str')

# Count cells for every (original cluster, transferred label) pair;
# rows = original clusters, columns = transferred labels.
celltype_counts = (
    xen_obj.obs
    .groupby(['clusterCellTypes', 'xenium_to_sc_label'])
    .size()
    .unstack()
)
celltype_counts.index.name = 'sc cell type'
celltype_counts.columns.name = 'predicted Xenium cell type'

# After transposing, rows = transferred Xenium labels, columns = sc clusters.
celltype_counts = celltype_counts.T
# Row-scale the co-occurrence counts: each transferred-label row is divided by
# its own total, giving the fraction of that label found in each sc cluster.
celltype_counts = celltype_counts.div(celltype_counts.sum(axis=1), axis=0) 
# Order the sc-cluster columns by the label in which they reach their maximum.
celltype_counts = celltype_counts.loc[:,celltype_counts.idxmax(axis=0).sort_values().index]
celltype_counts = celltype_counts.fillna(0)

In [None]:
# Sum of absolute values per column of the co-occurrence table
column_sums = celltype_counts.abs().sum(axis=0)
print(column_sums)

# Keep only the columns whose total reaches the 0.1 threshold
passes_threshold = column_sums >= 0.1
columns_to_keep = column_sums[passes_threshold].index
print(columns_to_keep)

celltype_counts2 = celltype_counts[columns_to_keep]
print(celltype_counts2)

In [None]:
# Heatmap of the filtered co-occurrence fractions on an explicit 14x6 figure
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(celltype_counts2, cmap='YlGnBu', ax=ax)

In [None]:
import scanpy as sc

# Marker genes for the epithelial / cycling scRNA-seq clusters.

# Step 1: Subset to the four clusters of interest. .copy() gives an independent
# object, so nothing below modifies `merged_sc`.
adata_subset = merged_sc[merged_sc.obs['clusterCellTypes'].isin(["Epithelial 1", "Epithelial 2", "Epithelial 3", "Cycling"])].copy()

# Step 2: Ensure all observation names of the object being analysed are unique
adata_subset.obs_names_make_unique()

# Step 3: Log-transform the data only if that is still meaningful.
# `merged` was z-scored with sc.pp.scale before `merged_sc` was taken from it,
# so X can contain negative values, and log1p of values <= -1 is NaN/-inf.
# Data already flagged as logged (uns['log1p']) is not logged a second time.
if 'log1p' not in adata_subset.uns and adata_subset.X.min() >= 0:
    sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'clusterCellTypes', method='wilcoxon', use_raw=False)

# Step 5: Extract the top 10 ranked genes for each cluster
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:10]
    for cluster in adata_subset.obs['clusterCellTypes'].cat.categories
}

# Step 6: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata_subset, groupby='clusterCellTypes')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_clusterCellTypes']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and collect each cluster's
# genes, skipping genes already claimed by an earlier cluster and taking at
# most 20 genes per cluster.
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    taken_for_cluster = 0
    for gene in top_genes_per_cluster.get(cluster, ()):
        if taken_for_cluster >= 20:
            break
        if gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_plot.append(gene)
            taken_for_cluster += 1

# Step 9: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='clusterCellTypes',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np

# Figure text in Arial; fonttype 42 keeps text as text (not outlines) in
# PDF/PS exports.
plt.rcParams.update({
    'font.family': 'Arial',
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Clusters to show, in display order
clusters_of_interest = ["Epithelial 1", "Epithelial 2", "Epithelial 3"]

# Genes to show, in display order (some genes appear more than once)
correct_order = [
    'KRT5', 'COL17A1', 'DST', 'ITGB4', 'CXCL14', 'C1R', 'IL1R2',
    'CAVIN1', 'PDPN', 'BASP1',
    'SLC26A2', 'MYC', 'CFH', 'PTN',
    'MKI67', 'CDK1', 'AQP3', 'KRT19', 'FGFBP1', 'S100A16', 'LY6D', 'SDC1',
    'CYP4B1', 'MAMDC2', 'SERPINB3', 'NOD2',
    'S100A16', 'SLPI', 'IL1RN', 'ANXA1',
    'C15orf48', 'IL36A', 'CNFN', 'IL36G',
    'SERPINB2', 'EHF', 'ERBB2', 'ODAM', 'ODAPH', 'LAMC2', 'CXCL1', 'CXCL2', 'CXCL6', 'SLPI',
    'IL1A', 'IL36G', 'C15orf48', 'ANXA1', 'DNASE1L3',
    'COL17A1', 'DST', 'C1R', 'ITGB4', 'SAA1', 'SAA2',
    'CYP4B1', 'CFH', 'EHF', 'CLEC7A', 'CFHR3',
]

# Cells of the selected clusters, as an independent copy
in_selected_clusters = adata_Wil_GEpi.obs['clusterCellTypes'].isin(clusters_of_interest)
adata_subset = adata_Wil_GEpi[in_selected_clusters].copy()

# An ordered categorical fixes the group order used by the plot
cluster_dtype = pd.CategoricalDtype(categories=clusters_of_interest, ordered=True)
adata_subset.obs['clusterCellTypes'] = adata_subset.obs['clusterCellTypes'].astype(cluster_dtype)

# Build the dot plot. swap_axes=False keeps the default orientation, and the
# style call switches to a grey colormap with no dot outlines and no grid.
dotplot = (
    sc.pl.DotPlot(
        adata_subset,
        var_names=correct_order,
        groupby='clusterCellTypes',
        standard_scale='var',      # scale each gene to [0, 1] across groups
        vmin=0.2,
        vmax=0.8,
        figsize=(20, 4),
    )
    .swap_axes(swap_axes=False)
    .style(
        cmap="Greys",
        dot_edge_color=None,
        dot_edge_lw=0,
        grid=False,
        dot_min=0.1,
        dot_max=0.8,
    )
)

# Show the plot
dotplot.show()

# Target file for the figure (a PDF); saving is currently disabled
output_path = '/data/vasileiosionat2/IBEX_FINAL/Scimap/Outputs/Figures_paper/Figure4/Xenium_Total_Epi_Lvl5_dotplot.pdf'
#dotplot.savefig(output_path, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np

# Figure text in Arial; fonttype 42 keeps text as text (not outlines) in
# PDF/PS exports.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# Transferred Xenium labels to show, in display order
clusters_of_interest = ["Crev", "Basal", "Spinous", "Keratin"]

# Genes to show, in display order (some genes appear more than once)
correct_order = [
   'KRT5', 'COL17A1', 'DST', 'ITGB4', 'CXCL14', 'C1R', 'IL1R2',
      'CAVIN1', 'PDPN', 'BASP1',
'SLC26A2', 'MYC', 'CFH', 'PTN',
'MKI67', 'CDK1', 'AQP3', 'KRT19', 'FGFBP1', 'S100A16', 'LY6D', 'SDC1', 
  'CYP4B1', 'MAMDC2', 'SERPINB3', 'NOD2',
'S100A16', 'SLPI', 'IL1RN', 'ANXA1',
'C15orf48', 'IL36A','CNFN', 'IL36G',
 'SERPINB2', 'EHF',  'ERBB2', 'ODAM', 'ODAPH', 'LAMC2', 'CXCL1', 'CXCL2', 'CXCL6', 'SLPI',
'IL1A', 'IL36G', 'C15orf48', 'ANXA1', 'DNASE1L3',
'COL17A1', 'DST', 'C1R', 'ITGB4', 'SAA1', 'SAA2',
'CYP4B1', 'CFH', 'EHF',  'CLEC7A', 'CFHR3'
]

# This cell needs adata_Wil_GEpi.obs['xenium_to_sc_label'], which is otherwise
# only created by a cell further down the notebook. When running top to bottom
# the column does not exist yet, so derive it here from `merged_sc`: drop the
# "-scSeq" suffix from the observation names and align on adata_Wil_GEpi's
# index. `merged_sc` itself is left untouched.
if 'xenium_to_sc_label' not in adata_Wil_GEpi.obs.columns:
    transferred_labels = merged_sc.obs['xenium_to_sc_label'].copy()
    transferred_labels.index = transferred_labels.index.str.replace("-scSeq", "", regex=False)
    adata_Wil_GEpi.obs['xenium_to_sc_label'] = transferred_labels.reindex(adata_Wil_GEpi.obs.index)

# Subset the AnnData object to the cells carrying one of the selected labels
adata_subset = adata_Wil_GEpi[adata_Wil_GEpi.obs['xenium_to_sc_label'].isin(clusters_of_interest)].copy()

# An ordered categorical fixes the group order used by the plot
adata_subset.obs['xenium_to_sc_label'] = adata_subset.obs['xenium_to_sc_label'].astype(
    pd.CategoricalDtype(categories=clusters_of_interest, ordered=True)
)

# Create the DotPlot
dotplot = sc.pl.DotPlot(
    adata_subset,
    var_names=correct_order,
    groupby='xenium_to_sc_label',
    standard_scale='var',          # scale each gene to [0, 1] across groups
    vmin=0.2,
    vmax=0.8,
    figsize=(20, 4)

)

# swap_axes=False keeps the default orientation (no transposition)
dotplot = dotplot.swap_axes(swap_axes=False)

# Grey colormap, no dot outlines, no grid
dotplot = dotplot.style(
    cmap="Greys",
    dot_edge_color=None,
    dot_edge_lw=0,
    grid=False,
    dot_min=0.1,
    dot_max=0.8,
)

# Show the plot
dotplot.show()

# Target file for the figure (a PDF); saving is currently disabled
output_path = '/data/vasileiosionat2/IBEX_FINAL/Scimap/Outputs/Figures_paper/Figure4/Xenium_Total_Epi_Lvl5_dotplot.pdf'
#dotplot.savefig(output_path, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# Inspect the per-gene metadata of the combined object.
adata.var

In [None]:
# Number of scRNA-seq cells per original cluster label.
category_counts = merged_sc.obs["clusterCellTypes"].value_counts()
print(category_counts)

In [None]:
# Number of scRNA-seq cells per transferred Xenium label.
category_counts = merged_sc.obs["xenium_to_sc_label"].value_counts()
print(category_counts)

In [None]:
# Remove every "-scSeq" occurrence from the observation names (presumably the
# suffix appended when the two assays were concatenated — verify) so the names
# can be matched against adata_Wil_GEpi. Mutates merged_sc in place.
merged_sc.obs.index = merged_sc.obs.index.str.replace("-scSeq", "", regex=False)

In [None]:
# Copy the transferred labels onto adata_Wil_GEpi, aligned on observation
# names; names that are absent from merged_sc receive NaN.
adata_Wil_GEpi.obs["xenium_to_sc_label"] = merged_sc.obs["xenium_to_sc_label"].reindex(adata_Wil_GEpi.obs.index)

In [None]:
# Load the second reference AnnData object (stored under the
# tabula_sapiens_epithelial directory).
adata_ts = sc.read_h5ad("/data/vasileiosionat2/Xenium/tabula_sapiens_epithelial/676b8605-7e5f-42c3-940a-aa7042c50f63.h5ad")

In [None]:
# Inspect the per-cell metadata of adata_ts.
adata_ts.obs

In [None]:
# List the distinct obs['tissue_type'] values in adata_ts.
adata_ts.obs['tissue_type'].unique().tolist()

In [None]:
# Keep the gingival epithelial cells with obs['status'] == 'H'
# (presumably healthy samples — TODO confirm the coding of 'status').
adata_Wil_HGEpi = adata_Wil_GEpi[adata_Wil_GEpi.obs['status'] == 'H'].copy()

In [None]:
# Number of status-'H' cells per transferred Xenium label.
category_counts = adata_Wil_HGEpi.obs["xenium_to_sc_label"].value_counts()
print(category_counts)

In [None]:
# Inspect the per-cell metadata of the status-'H' subset.
adata_Wil_HGEpi.obs

In [None]:
# Build a "tissue_label" key per cell from obs['tissue'] and the transferred
# label. astype(str) turns a missing label into the literal text "nan".
adata_Wil_HGEpi.obs["cell_type_integration"] = adata_Wil_HGEpi.obs["tissue"].astype(str) + "_" + adata_Wil_HGEpi.obs["xenium_to_sc_label"].astype(str)

In [None]:
# Inspect the metadata again, now including 'cell_type_integration'.
adata_Wil_HGEpi.obs

In [None]:
# Same "tissue_label" key for adata_ts, built from obs['tissue'] and
# obs['cell_type'].
adata_ts.obs["cell_type_integration"] = adata_ts.obs["tissue"].astype(str) + "_" + adata_ts.obs["cell_type"].astype(str)

In [None]:
# Inspect the metadata again, now including 'cell_type_integration'.
adata_ts.obs

In [None]:
# List the distinct "tissue_label" keys present in adata_ts.
adata_ts.obs['cell_type_integration'].unique().tolist()

In [None]:
import scanpy as sc

# Metadata columns that exist in both adata_ts.obs and adata_Wil_HGEpi.obs;
# the result is displayed as the cell output.
common_obs_columns = adata_ts.obs.columns.intersection(adata_Wil_HGEpi.obs.columns)
common_obs_columns

In [None]:
# Cast the gene index of both objects to plain strings.
adata_ts.var.index = adata_ts.var.index.astype(str)
adata_Wil_HGEpi.var.index = adata_Wil_HGEpi.var.index.astype(str)

In [None]:
# Inspect the per-gene metadata of the status-'H' subset.
adata_Wil_HGEpi.var

In [None]:
import pandas as pd
import scanpy as sc

# Bring adata_ts and adata_Wil_HGEpi onto a common gene index and stack them.
# Both input objects are overwritten with their shared-gene subsets.

# Step 1: Standardize the var column names of both objects to lower case
adata_ts.var.columns = adata_ts.var.columns.str.lower()
adata_Wil_HGEpi.var.columns = adata_Wil_HGEpi.var.columns.str.lower()

# Step 2: If adata_Wil_HGEpi.var has exactly one column, treat it as the gene
# name column
if adata_Wil_HGEpi.var.shape[1] == 1:
    adata_Wil_HGEpi.var = adata_Wil_HGEpi.var.rename(
        columns={adata_Wil_HGEpi.var.columns[0]: "feature_name"}
    )

# Step 3: Use "feature_name" as the gene index where that column exists.
# Replacing the index discards any earlier cast of the index to str, so the
# cast is re-applied here, and duplicated names are made unique so that the
# label-based selections below are unambiguous.
for _adata in (adata_ts, adata_Wil_HGEpi):
    if "feature_name" in _adata.var.columns:
        _adata.var = _adata.var.set_index("feature_name")
    _adata.var_names = _adata.var_names.astype(str)
    _adata.var_names_make_unique()

# Step 4: Identify the genes present in both objects
common_genes = adata_ts.var_names.intersection(adata_Wil_HGEpi.var_names)

# Step 5: Subset both datasets to the shared genes (same order in both)
adata_ts = adata_ts[:, common_genes].copy()
adata_Wil_HGEpi = adata_Wil_HGEpi[:, common_genes].copy()

# Step 6: Give adata_Wil_HGEpi the same var table as adata_ts.
# .copy() keeps the two objects from sharing one DataFrame.
adata_Wil_HGEpi.var = adata_ts.var.loc[common_genes].copy()

# Step 7: Concatenate; obs['batch'] identifies the source object of each row
adata_combined = adata_ts.concatenate(adata_Wil_HGEpi, join='inner', batch_key="batch")

# Step 8: Restore the var table of adata_ts on the combined object. A copy is
# assigned so later writes to adata_combined.var do not also change
# adata_ts.var.
adata_combined.var = adata_ts.var.copy()


In [None]:
# Library-size normalise to 10,000 per cell, log1p-transform, and flag the
# 2000 most variable genes (Seurat flavour) in var['highly_variable'].
# NOTE(review): this assumes X of both concatenated inputs still holds raw
# counts at this point — verify, since the inputs are loaded pre-processed.
sc.pp.normalize_total(adata_combined, target_sum=1e4)  # Normalize counts per cell
sc.pp.log1p(adata_combined)  # Log-transform
sc.pp.highly_variable_genes(adata_combined, flavor='seurat', n_top_genes=2000)

In [None]:
# Keep only the genes flagged as highly variable, then standardise each gene.
# From here on adata_combined.X holds scaled values, not log-normalised ones.
adata_combined = adata_combined[:, adata_combined.var['highly_variable']].copy()
sc.pp.scale(adata_combined)

In [None]:
# Inspect the per-gene metadata of the combined object.
adata_combined.var

In [None]:
# Inspect the per-cell metadata of the combined object.
adata_combined.obs

In [None]:
# PCA on the scaled highly-variable-gene matrix.
sc.tl.pca(adata_combined)

In [None]:
# Checkpoint: save the combined, scaled object (with PCA) to disk.
adata_combined.write('/data/vasileiosionat2/Xenium/tabula_sapiens_epithelial/ts_oralscSeq.h5ad')

In [None]:
# Reload the checkpoint written by the previous cell.
adata_combined = sc.read_h5ad("/data/vasileiosionat2/Xenium/tabula_sapiens_epithelial/ts_oralscSeq.h5ad")

In [None]:
# Inspect the per-cell metadata of the reloaded object.
adata_combined.obs

In [None]:
# Harmony integration across the two source objects (key="batch"); the
# corrected embedding is read back below as obsm['X_pca_harmony'].
# This step takes time; ~10 min per iteration
sc.external.pp.harmony_integrate(adata_combined, key="batch", max_iter_harmony=20, max_iter_kmeans=30)

In [None]:
# Build the neighbour graph on the Harmony-corrected embedding (30 neighbours,
# correlation distance) and compute a UMAP from it; takes a long time.
sc.pp.neighbors(adata_combined, n_neighbors=30, use_rep="X_pca_harmony", metric="correlation")
sc.tl.umap(adata_combined, min_dist=0.8)

In [None]:
# Colour the UMAP by obs['tissue_in_publication'].
sc.pl.umap(adata_combined, color='tissue_in_publication')

In [None]:
# Add the two gingival tissue labels to the categorical column. Only categories
# that are not present yet are added, so the cell can be re-run without
# add_categories raising on an existing category.
tissue_labels = adata_combined.obs['tissue_in_publication']
missing_labels = [
    label for label in ('Gingiva-TAE', 'Gingiva-OE')
    if label not in tissue_labels.cat.categories
]
adata_combined.obs['tissue_in_publication'] = tissue_labels.cat.add_categories(missing_labels)

# Every cell from the second concatenated object (batch "1") is labelled
# 'Gingiva-OE' ...
adata_combined.obs.loc[adata_combined.obs['batch'] == "1", 'tissue_in_publication'] = 'Gingiva-OE'
# ... except cells whose integration key is "gingiva_Crev", which are
# relabelled 'Gingiva-TAE'.
adata_combined.obs.loc[adata_combined.obs['cell_type_integration'] == "gingiva_Crev", 'tissue_in_publication'] = 'Gingiva-TAE'

In [None]:
# List the distinct tissue labels after relabelling.
adata_combined.obs['tissue_in_publication'].unique().tolist()

In [None]:
# Tissues retained for the barrier-site comparison
barrier_tissues = [
    "Lung",
    "Large_Intestine",
    "Skin",
    "Small_Intestine",
    "Tongue",
    "Gingiva-OE",
    "Gingiva-TAE",
]
is_barrier_tissue = adata_combined.obs['tissue_in_publication'].isin(barrier_tissues)
adata_combined_barrier = adata_combined[is_barrier_tissue].copy()

In [None]:
# List the distinct "tissue_label" keys present in the barrier-tissue subset.
adata_combined_barrier.obs['cell_type_integration'].unique().tolist()

In [None]:
# "tissue_label" keys retained in the barrier subset, grouped by site
barrier_epithelial_keys = [
    # lung
    'lung_respiratory goblet cell',
    'lung_basal cell',
    'lung_pulmonary alveolar type 2 cell',
    'lung_pulmonary alveolar type 1 cell',
    'lung_lung ciliated cell',
    'lung_club cell',
    # skin
    'skin of chest_epithelial cell',
    'skin of abdomen_epithelial cell',
    # tongue
    'posterior part of tongue_basal cell',
    'anterior part of tongue_basal cell',
    'anterior part of tongue_stratified squamous epithelial cell',
    'posterior part of tongue_stratified squamous epithelial cell',
    'tongue_basal cell',
    'tongue_stratified squamous epithelial cell',
    # large intestine
    'large intestine_enterocyte of epithelium of large intestine',
    'large intestine_paneth cell of colon',
    'large intestine_intestinal crypt stem cell of colon',
    'large intestine_tuft cell of colon',
    'large intestine_large intestine goblet cell',
    'large intestine_enterochromaffin-like cell',
    'large intestine_transit amplifying cell of colon',
    # small intestine
    'small intestine_enterocyte of epithelium proper of small intestine',
    'small intestine_paneth cell of epithelium of small intestine',
    'small intestine_intestinal crypt stem cell of small intestine',
    'small intestine_small intestine goblet cell',
    'small intestine_intestinal tuft cell',
    'small intestine_BEST4+ intestinal epithelial cell, human',
    'small intestine_enterocyte of epithelium proper of ileum',
    'small intestine_enterocyte of epithelium proper of duodenum',
    'small intestine_transit amplifying cell of small intestine',
    # ascending colon
    'ascending colon_large intestine goblet cell',
    'ascending colon_intestinal crypt stem cell of colon',
    'ascending colon_tuft cell of colon',
    'ascending colon_enterocyte of epithelium of large intestine',
    'ascending colon_BEST4+ intestinal epithelial cell, human',
    'ascending colon_transit amplifying cell of colon',
    'ascending colon_enterochromaffin-like cell',
    'ascending colon_paneth cell of colon',
    # ileum
    'ileum_enterocyte of epithelium proper of ileum',
    'ileum_small intestine goblet cell',
    'ileum_intestinal crypt stem cell of small intestine',
    'ileum_paneth cell of epithelium of small intestine',
    'ileum_intestinal tuft cell',
    'ileum_BEST4+ intestinal epithelial cell, human',
    # duodenum
    'duodenum_enterocyte of epithelium proper of duodenum',
    'duodenum_BEST4+ intestinal epithelial cell, human',
    'duodenum_paneth cell of epithelium of small intestine',
    'duodenum_small intestine goblet cell',
    'duodenum_intestinal crypt stem cell of small intestine',
    'duodenum_intestinal tuft cell',
    'duodenum_transit amplifying cell of small intestine',
    'duodenum_enterocyte of epithelium proper of ileum',
    # sigmoid colon
    'sigmoid colon_large intestine goblet cell',
    'sigmoid colon_intestinal crypt stem cell of colon',
    'sigmoid colon_BEST4+ intestinal epithelial cell, human',
    'sigmoid colon_enterocyte of epithelium of large intestine',
    'sigmoid colon_transit amplifying cell of colon',
    'sigmoid colon_paneth cell of colon',
    'sigmoid colon_enterochromaffin-like cell',
    'sigmoid colon_tuft cell of colon',
    # jejunum
    'jejunum_enterocyte of epithelium proper of jejunum',
    'jejunum_intestinal crypt stem cell of small intestine',
    'jejunum_intestinal tuft cell',
    'jejunum_paneth cell of epithelium of small intestine',
    'jejunum_BEST4+ intestinal epithelial cell, human',
    'jejunum_small intestine goblet cell',
    # gingiva (transferred Xenium labels)
    'gingiva_Crev',
    'gingiva_Basal',
    'gingiva_Spinous',
    'gingiva_Keratin',
]

# Independent copy of the barrier-tissue cells carrying one of the keys above
has_selected_key = adata_combined_barrier.obs['cell_type_integration'].isin(barrier_epithelial_keys)
adata_combined_barrier_Epi = adata_combined_barrier[has_selected_key].copy()

In [None]:
# List the "tissue_label" keys that are actually present after filtering.
adata_combined_barrier_Epi.obs['cell_type_integration'].unique().tolist()

In [None]:
# Inspect the per-cell metadata of the barrier subset.
adata_combined_barrier_Epi.obs

In [None]:
import scanpy as sc

# Coarse clustering of the barrier subset.

# Step 1: Compute a 20-component PCA on this subset; the result is stored in
# obsm['X_pca'] (replacing any PCA carried over from adata_combined).
# NOTE(review): the graph in Step 2 is built on this 'X_pca', not on the
# Harmony-corrected 'X_pca_harmony' — confirm that clustering on the
# uncorrected embedding is intended.
sc.pp.pca(adata_combined_barrier_Epi, n_comps=20)

# Step 2: Compute the nearest neighbors (15 per cell) on 'X_pca'
sc.pp.neighbors(adata_combined_barrier_Epi, use_rep='X_pca', n_neighbors=15)

# Step 3: Perform Leiden clustering with a low resolution for coarse clustering
sc.tl.leiden(adata_combined_barrier_Epi, resolution=0.1)  # A lower value gives fewer, coarser clusters

# Step 4: Visualize the clusters on the UMAP.
# NOTE(review): no sc.tl.umap call is made in this cell, so the coordinates
# shown are presumably the ones computed earlier on adata_combined — verify.
sc.pl.umap(adata_combined_barrier_Epi, color=['leiden'])


In [None]:
# Colour the same UMAP by tissue label.
sc.pl.umap(adata_combined_barrier_Epi, color=['tissue_in_publication'])

In [None]:
# List the distinct tissue labels present in the barrier subset.
adata_combined_barrier_Epi.obs['tissue_in_publication'].unique().tolist()

In [None]:
import scanpy as sc

# Marker genes per "tissue_label" key for the squamous / ocular / gingival
# groups listed below.

# Step 1: Subset to the listed keys. .copy() makes an independent object, so
# the .obs assignment and the in-place scanpy calls below never operate on a
# view of adata_combined_barrier_Epi.
adata_subset = adata_combined_barrier_Epi[adata_combined_barrier_Epi.obs['cell_type_integration'].isin([
 'skin of chest_epithelial cell',
 'posterior part of tongue_basal cell',
 'anterior part of tongue_basal cell',
 'anterior part of tongue_stratified squamous epithelial cell',
 'eye_conjunctival epithelial cell',
 'tongue_basal cell',
 'tongue_stratified squamous epithelial cell',
 'ocular surface region_conjunctival epithelial cell',
 'ocular surface region_corneal epithelial cell',
 'sclera_conjunctival epithelial cell',
'anterior segment of eyeball_conjunctival epithelial cell',
'posterior part of tongue_stratified squamous epithelial cell',
 'conjunctiva_conjunctival epithelial cell',
'posterior segment of eyeball_conjunctival epithelial cell', 'cornea_corneal epithelial cell',
'eyelid_conjunctival epithelial cell', 'gingiva_Crev', 'gingiva_Basal', 'gingiva_Spinous',
'gingiva_Keratin'
])].copy()

# Ensure the observation names of the object being analysed are unique
adata_subset.obs_names_make_unique()

# Ensure the grouping column is categorical
adata_subset.obs['cell_type_integration'] = adata_subset.obs['cell_type_integration'].astype('category')

# Step 2: Log-transform only if not already done. The combined object was
# log-transformed and then scaled before this subset was taken, so applying
# log1p again would take the log of z-scores (NaN/-inf for values <= -1).
if 'log1p' in adata_subset.uns:
    # Already logged. Some scanpy releases look up uns['log1p']['base'] and the
    # key can be missing after an h5ad round-trip, so make sure it exists.
    adata_subset.uns['log1p'].setdefault('base', None)
elif adata_subset.X.min() >= 0:
    sc.pp.log1p(adata_subset)

# Step 3: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'cell_type_integration', method='wilcoxon', use_raw=False)

# Step 4: Extract the top 20 ranked genes for each group
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:20]
    for cluster in adata_subset.obs['cell_type_integration'].cat.categories
}

# Step 5: Generate dendrogram for group ordering
sc.tl.dendrogram(adata_subset, groupby='cell_type_integration')

# Step 6: Retrieve the group order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_cell_type_integration']['categories_ordered']

# Step 7: Walk the groups in dendrogram order and collect each group's genes,
# skipping genes already claimed by an earlier group (at most 20 per group).
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    taken_for_cluster = 0
    for gene in top_genes_per_cluster.get(cluster, ()):
        if taken_for_cluster >= 20:  # Adjust to your desired number of genes
            break
        if gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_plot.append(gene)
            taken_for_cluster += 1

# Step 8: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='cell_type_integration',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)

In [None]:
# Make sure 'Gingiva-TAE' is an allowed category of the tissue column. The
# category may already exist (it is added to adata_combined further up), and
# add_categories raises if asked to add an existing category, so it is only
# added when missing.
tissue_labels = adata_combined_barrier_Epi.obs['tissue_in_publication']
if 'Gingiva-TAE' not in tissue_labels.cat.categories:
    adata_combined_barrier_Epi.obs['tissue_in_publication'] = tissue_labels.cat.add_categories('Gingiva-TAE')

# Label the cells whose integration key is "gingiva_Crev" as 'Gingiva-TAE'
adata_combined_barrier_Epi.obs.loc[adata_combined_barrier_Epi.obs['cell_type_integration'] == "gingiva_Crev", 'tissue_in_publication'] = 'Gingiva-TAE'

In [None]:
import scanpy as sc
import pandas as pd

# Marker genes per tissue label across the barrier sites.

# Step 1: Ensure all observation names are unique
adata_combined_barrier_Epi.obs_names_make_unique()

# Subset to the listed tissues. .copy() makes an independent object, so the
# .obs assignment and the in-place scanpy calls below never operate on a view.
adata_subset = adata_combined_barrier_Epi[adata_combined_barrier_Epi.obs['tissue_in_publication'].isin(['Lung',
 'Tongue',
 'Skin',
 'Large_Intestine',
 'Small_Intestine',
 'Gingiva-OE',
 'Gingiva-TAE'
])].copy()

# Ensure the grouping column is categorical
adata_subset.obs['tissue_in_publication'] = adata_subset.obs['tissue_in_publication'].astype('category')

# Step 2: Log-transform only if not already done. The combined object was
# log-transformed and then scaled before this subset was taken, so applying
# log1p again would take the log of z-scores (NaN/-inf for values <= -1).
if 'log1p' in adata_subset.uns:
    # Already logged. Some scanpy releases look up uns['log1p']['base'] and the
    # key can be missing after an h5ad round-trip, so make sure it exists.
    adata_subset.uns['log1p'].setdefault('base', None)
elif adata_subset.X.min() >= 0:
    sc.pp.log1p(adata_subset)

# Step 3: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'tissue_in_publication', method='wilcoxon', use_raw=False)

# Step 4: Extract the top 50 ranked genes for each tissue
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:50]
    for cluster in adata_subset.obs['tissue_in_publication'].cat.categories
}

# One column per tissue, one row per rank (top 50 genes)
df_top_genes = pd.DataFrame.from_dict(top_genes_per_cluster, orient='index').transpose()

# Save to Excel
#df_top_genes.to_excel("top_marker_genes_by_tissue.xlsx", index=False)

# Step 5: Generate dendrogram for tissue ordering
sc.tl.dendrogram(adata_subset, groupby='tissue_in_publication')

# Step 6: Retrieve the tissue order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_tissue_in_publication']['categories_ordered']

# Step 7: Walk the tissues in dendrogram order and collect each tissue's genes,
# skipping genes already claimed by an earlier tissue (at most 30 per tissue).
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    taken_for_cluster = 0
    for gene in top_genes_per_cluster.get(cluster, ()):
        if taken_for_cluster >= 30:  # Adjust to your desired number of genes
            break
        if gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_plot.append(gene)
            taken_for_cluster += 1

# Step 8: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='tissue_in_publication',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True,
)


In [None]:
# Display the table of top-ranked genes (one column per tissue).
df_top_genes

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np

# Figure text in Arial; fonttype 42 keeps text as text (not outlines) in
# PDF/PS exports.
plt.rcParams.update({
    'font.family': 'Arial',
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Tissues to show, in display order
clusters_of_interest = [
    'Skin',
    'Large_Intestine',
    'Small_Intestine',
    'Lung',
    'Tongue',
    'Gingiva-OE',
    'Gingiva-TAE',
]

# Genes to show, in display order
correct_order = [
    'DCD', 'SEC11C', 'CEACAM5', 'MUC12', 'KRT20', 'PIGR', 'MUC13',
    'MUC17', 'MME', 'SFTPB',
    'IFITM3', 'SLPI', 'CXCL17', 'KRT13',
    'KRT5', 'KRT6A', 'S100A8', 'S100A9', 'CXCL14', 'KRT4', 'ODAM', 'MMP12', 'FDCSP', 'SAA1',
    'SAA2', 'DNASE1L3', 'CXCL1', 'CXCL8',
]

# Cells of the selected tissues, as an independent copy
in_selected_tissues = adata_combined_barrier_Epi.obs['tissue_in_publication'].isin(clusters_of_interest)
adata_subset = adata_combined_barrier_Epi[in_selected_tissues].copy()

# An ordered categorical fixes the group order used by the plot
tissue_dtype = pd.CategoricalDtype(categories=clusters_of_interest, ordered=True)
adata_subset.obs['tissue_in_publication'] = adata_subset.obs['tissue_in_publication'].astype(tissue_dtype)

# Build the dot plot. swap_axes=False keeps the default orientation, and the
# style call switches to a grey colormap with no dot outlines and no grid.
dotplot = (
    sc.pl.DotPlot(
        adata_subset,
        var_names=correct_order,
        groupby='tissue_in_publication',
        standard_scale='var',      # scale each gene to [0, 1] across groups
        vmin=0.1,
        vmax=0.8,
        figsize=(9, 4),
    )
    .swap_axes(swap_axes=False)
    .style(
        cmap="Greys",
        dot_edge_color=None,
        dot_edge_lw=0,
        grid=False,
        dot_min=0.1,
        dot_max=0.8,
    )
)

# Show the plot
dotplot.show()

# Save the figure as a PDF with a transparent background
output_path = '/data/vasileiosionat2/IBEX_FINAL/Scimap/Outputs/Figures_paper/Figure4/Tabula_Sapiens_integration.pdf'
dotplot.savefig(output_path, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
import scanpy as sc

# Marker genes per Leiden cluster of the barrier subset.

# Step 1: Subset to the listed clusters. .copy() makes an independent object,
# so the .obs assignment and the in-place scanpy calls below never operate on
# a view of adata_combined_barrier_Epi.
adata_subset = adata_combined_barrier_Epi[adata_combined_barrier_Epi.obs['leiden'].isin(['2', '5', '8', '10', '9', '0', '4', '6', '13', '1', '3', '11', '12', '7'])].copy()

# Ensure the observation names of the object being analysed are unique
adata_subset.obs_names_make_unique()

# Ensure 'leiden' is categorical
adata_subset.obs['leiden'] = adata_subset.obs['leiden'].astype('category')

# Step 2: Log-transform only if not already done. The combined object was
# log-transformed and then scaled before this subset was taken, so applying
# log1p again would take the log of z-scores (NaN/-inf for values <= -1).
if 'log1p' in adata_subset.uns:
    # Already logged. Some scanpy releases look up uns['log1p']['base'] and the
    # key can be missing after an h5ad round-trip, so make sure it exists.
    adata_subset.uns['log1p'].setdefault('base', None)
elif adata_subset.X.min() >= 0:
    sc.pp.log1p(adata_subset)

# Step 3: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'leiden', method='wilcoxon', use_raw=False)

# Step 4: Extract the top 10 ranked genes for each cluster
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:10]
    for cluster in adata_subset.obs['leiden'].cat.categories
}

# Step 5: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata_subset, groupby='leiden')

# Step 6: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_leiden']['categories_ordered']

# Step 7: Walk the clusters in dendrogram order and collect each cluster's
# genes, skipping genes already claimed by an earlier cluster (at most 20 per
# cluster).
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    taken_for_cluster = 0
    for gene in top_genes_per_cluster.get(cluster, ()):
        if taken_for_cluster >= 20:  # Adjust to your desired number of genes
            break
        if gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_plot.append(gene)
            taken_for_cluster += 1

# Step 8: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='leiden',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)

In [None]:
import scanpy as sc
# Matplotlib font configuration for exported vector figures (no background colour is set here)
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.family'] = 'Arial'
plt.rcParams['pdf.fonttype'] = 42  # Ensures fonts are embedded as text, not outlines
plt.rcParams['ps.fonttype'] = 42

# Define the subset of clusters you want to include and their desired order
clusters_of_interest = ['2', '5', '8', '10', '9', '0', '4', '6', '13', '1', '3', '11', '12', '7']

# Define the subset of genes you want to plot in the desired order.
# NOTE(review): several genes (e.g. 'S100A16', 'SLPI', 'COL17A1', 'IL36G') are listed more than
# once, so they are plotted repeatedly — presumably one marker group per cluster; confirm.
correct_order = [
   'KRT5', 'COL17A1', 'DST', 'ITGB4', 'CXCL14', 'C1R', 'IL1R2',
      'CAVIN1', 'PDPN', 'BASP1',
'SLC26A2', 'MYC', 'CFH', 'PTN',
'MKI67', 'CDK1', 'AQP3', 'KRT19', 'FGFBP1', 'S100A16', 'LY6D', 'SDC1', 
  'CYP4B1', 'MAMDC2', 'SERPINB3', 'NOD2',
'S100A16', 'SLPI', 'IL1RN', 'ANXA1',
'C15orf48', 'IL36A','CNFN', 'IL36G',
 'SERPINB2', 'EHF',  'ERBB2', 'ODAM', 'ODAPH', 'LAMC2', 'CXCL1', 'CXCL2', 'CXCL6', 'SLPI',
'IL1A', 'IL36G', 'C15orf48', 'ANXA1', 'DNASE1L3',
'COL17A1', 'DST', 'C1R', 'ITGB4', 'SAA1', 'SAA2',
'CYP4B1', 'CFH', 'EHF',  'CLEC7A', 'CFHR3'
]

# Subset the AnnData object to include only the clusters of interest
adata_subset = adata_combined_barrier_Epi[adata_combined_barrier_Epi.obs['leiden'].isin(clusters_of_interest)].copy()

# Ensure clusters are ordered as specified.
# `pd` is not imported in this cell; it relies on the notebook-level pandas import.
adata_subset.obs['leiden'] = adata_subset.obs['leiden'].astype(
    pd.CategoricalDtype(categories=clusters_of_interest, ordered=True)
)

# Create the DotPlot
dotplot = sc.pl.DotPlot(
    adata_subset,
    var_names=correct_order,       # Genes on the x-axis
    groupby='leiden',                # Clusters on the y-axis
    standard_scale='var',          # Rescale each gene to the 0-1 range across groups
    vmin=0.2,                       # Lower limit of the colour scale
    vmax=0.8,                       # Upper limit of the colour scale
    figsize=(20, 4)

)

# swap_axes=False leaves the layout unchanged (genes on x, clusters on y); pass True to transpose
dotplot = dotplot.swap_axes(swap_axes=False)

# Modify the style for grayscale and remove outlines
dotplot = dotplot.style(
    cmap="Greys",               # Use grayscale colormap
    dot_edge_color=None,        # None = scanpy's default edge colour; outlines are hidden by dot_edge_lw=0
    dot_edge_lw=0,              # No line width for edges
    grid=False,                 # Optional: Disable grid lines
    dot_min=0.1,                # Expressing fraction mapped to the smallest dot
    dot_max=0.8,                 # Expressing fraction mapped to the largest dot
)

# Show the plot
dotplot.show()

# Target path for a PDF export; the save call below is currently disabled
output_path = '/data/vasileiosionat2/IBEX_FINAL/Scimap/Outputs/Figures_paper/Figure4/Xenium_Total_Epi_Lvl5_dotplot.pdf'
#dotplot.savefig(output_path, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
sc.pl.umap(adata_combined_barrier_Epi, color=['cell_type_integration'])

In [None]:
adata_combined.obs['cell_type_integration'].unique().tolist()

In [None]:
import scanpy as sc

# Which leiden clusters do the cells annotated as 'gingiva_Crev' fall into?
crev_mask = adata_combined.obs['cell_type_integration'] == 'gingiva_Crev'
gingiva_crev_cells = adata_combined[crev_mask]

# Distinct leiden assignments among those cells
gingiva_crev_cells.obs['leiden'].unique().tolist()

In [None]:
import scanpy as sc

# Cell-type composition of leiden cluster '1' in adata_combined
in_leiden1 = adata_combined.obs['leiden'] == '1'
leiden1 = adata_combined[in_leiden1]

# Number of cells per 'cell_type_integration' value within that cluster
cell_counts = leiden1.obs['cell_type_integration'].value_counts()
print(cell_counts)


In [None]:
# Visualize merged with UMAP embedding; takes a long time
sc.pp.neighbors(adata_combined, n_neighbors=50, use_rep="X_pca_harmony", metric="correlation")
sc.tl.umap(adata_combined, min_dist=0.5)

In [None]:
adata_ts.obs.columns.unique().tolist()

In [None]:
refTNK.obs

In [None]:
refBPl = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration/BPlasma.h5ad')

In [None]:
refBPl.obs

In [None]:
# Inspect the per-gene annotation table of the B/plasma reference
# (the original line had a stray ']' and did not parse)
refBPl.var

In [None]:
import pandas as pd

# Path of the CSV that is attached to refBPl.obs in the next cell
file_path = "/data/vasileiosionat2/Xenium/Integration/BPlasma.csv"

# Read the CSV into a DataFrame
dfBPl = pd.read_csv(file_path)

# Display the DataFrame
print(dfBPl)

# Templates for inspecting a single column / row (replace "ColumnName" with a real column)
#print(dfBPl["ColumnName"])  # Access a column
#print(dfBPl.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Copy every column of dfBPl into refBPl.obs.
# Check the dimensions
if len(dfBPl) != refBPl.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# NOTE(review): columns are copied by position (.values discards the CSV index), and only the
# row count is checked above. This assumes the CSV rows are in the same order as refBPl.obs —
# confirm, or align on the cell barcodes first, e.g.:
# dfBPl.index = refBPl.obs.index

# Add the CSV columns to refBPl.obs
for column in dfBPl.columns:
    refBPl.obs[column] = dfBPl[column].values

# Save the updated AnnData object (optional)
#adata.write("updated_adata.h5ad")


In [None]:
refBPl.obs

In [None]:
# Remove the 'Unnamed: 0' column from adata.obs
if 'Unnamed: 0' in refBPl.obs.columns:
    del refBPl.obs['Unnamed: 0']

# Alternatively, using the drop method
# adata.obs.drop(columns=['Unnamed: 0'], inplace=True)

# Verify the column is removed
print(refBPl.obs.head())


In [None]:
refBPl.obs.rename(columns={"BPlasmaClusters": "cluster"}, inplace=True)

In [None]:
refBPl.obs

In [None]:
refStr = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration/Stromal.h5ad')

In [None]:
refStr.obs

In [None]:
import pandas as pd

# Path of the CSV that is attached to refStr.obs in the next cell
file_path = "/data/vasileiosionat2/Xenium/Integration/Stromal.csv"

# Read the CSV into a DataFrame
dfStr = pd.read_csv(file_path)

# Display the DataFrame
print(dfStr)

# Templates for inspecting a single column / row (replace "ColumnName" with a real column)
#print(dfStr["ColumnName"])  # Access a column
#print(dfStr.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Copy every column of dfStr into refStr.obs.
# Check the dimensions
if len(dfStr) != refStr.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# NOTE(review): columns are copied by position (.values discards the CSV index), and only the
# row count is checked above. This assumes the CSV rows are in the same order as refStr.obs —
# confirm, or align on the cell barcodes first, e.g.:
# dfStr.index = refStr.obs.index

# Add the CSV columns to refStr.obs
for column in dfStr.columns:
    refStr.obs[column] = dfStr[column].values

# Save the updated AnnData object (optional)
#adata.write("updated_adata.h5ad")


In [None]:
refStr.obs

In [None]:
refStr.var

In [None]:
# NOTE(review): within this part of the notebook refMyel is first loaded several cells further
# down (sc.read_h5ad of Myeloid.h5ad); on a fresh top-to-bottom run this cell raises NameError
# unless refMyel was already loaded earlier — confirm and move below the load if needed.
refMyel.var

In [None]:
# Remove the 'Unnamed: 0' column from adata.obs
if 'Unnamed: 0' in refStr.obs.columns:
    del refStr.obs['Unnamed: 0']

# Alternatively, using the drop method
# adata.obs.drop(columns=['Unnamed: 0'], inplace=True)

# Verify the column is removed
print(refStr.obs.head())


In [None]:
refStr.obs.rename(columns={"StromalClusters": "cluster"}, inplace=True)

In [None]:
refStr.obs

In [None]:
refEpi = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration/Epithelial.h5ad')

In [None]:
refEpi.obs

In [None]:
refEpi.var

In [None]:
import pandas as pd

# Path of the CSV that is attached to refEpi.obs in the next cell
file_path = "/data/vasileiosionat2/Xenium/Integration/Epithelial.csv"

# Read the CSV into a DataFrame
dfEpi = pd.read_csv(file_path)

# Display the DataFrame
print(dfEpi)

# Templates for inspecting a single column / row (replace "ColumnName" with a real column).
# Kept commented out, as in the sibling cells: the placeholder column name raises KeyError.
#print(dfEpi["ColumnName"])  # Access a column
#print(dfEpi.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Copy every column of dfEpi into refEpi.obs.
# Check the dimensions
if len(dfEpi) != refEpi.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# NOTE(review): columns are copied by position (.values discards the CSV index), and only the
# row count is checked above. This assumes the CSV rows are in the same order as refEpi.obs —
# confirm, or align on the cell barcodes first, e.g.:
# dfEpi.index = refEpi.obs.index

# Add the CSV columns to refEpi.obs
for column in dfEpi.columns:
    refEpi.obs[column] = dfEpi[column].values

# Save the updated AnnData object (optional)
#adata.write("updated_adata.h5ad")


In [None]:
refEpi.obs

In [None]:
# Remove the 'Unnamed: 0' column from adata.obs
if 'Unnamed: 0' in refEpi.obs.columns:
    del refEpi.obs['Unnamed: 0']

# Alternatively, using the drop method
# adata.obs.drop(columns=['Unnamed: 0'], inplace=True)

# Verify the column is removed
print(refEpi.obs.head())


In [None]:
refEpi.obs.rename(columns={"EpithelialClusters": "cluster"}, inplace=True)

In [None]:
refEpi.obs

In [None]:
refMyel = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration/Myeloid.h5ad')

In [None]:
refMyel.obs

In [None]:
import pandas as pd

# Path of the CSV that is attached to refMyel.obs in the next cell
file_path = "/data/vasileiosionat2/Xenium/Integration/Myeloid.csv"

# Read the CSV into a DataFrame
dfMyel = pd.read_csv(file_path)

# Display the DataFrame
print(dfMyel)

# Templates for inspecting a single column / row (replace "ColumnName" with a real column)
#print(dfMyel["ColumnName"])  # Access a column
#print(dfMyel.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Copy every column of dfMyel into refMyel.obs.
# Check the dimensions
if len(dfMyel) != refMyel.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# NOTE(review): columns are copied by position (.values discards the CSV index), and only the
# row count is checked above. This assumes the CSV rows are in the same order as refMyel.obs —
# confirm, or align on the cell barcodes first, e.g.:
# dfMyel.index = refMyel.obs.index

# Add the CSV columns to refMyel.obs
for column in dfMyel.columns:
    refMyel.obs[column] = dfMyel[column].values

# Save the updated AnnData object (optional)
#adata.write("updated_adata.h5ad")


In [None]:
refMyel.obs

In [None]:
# Remove the 'Unnamed: 0' column from adata.obs
if 'Unnamed: 0' in refMyel.obs.columns:
    del refMyel.obs['Unnamed: 0']

# Alternatively, using the drop method
# adata.obs.drop(columns=['Unnamed: 0'], inplace=True)

# Verify the column is removed
print(refMyel.obs.head())


In [None]:
refMyel.obs.rename(columns={"MyeloidClusters": "cluster"}, inplace=True)

In [None]:
refMyel.obs

In [None]:
import anndata as ad

# Stack the five lineage references cell-wise into one object.
# join='outer' keeps the union of genes across the inputs rather than only the shared ones.
# NOTE(review): refTNK is not loaded anywhere in this part of the notebook — confirm it is
# defined by an earlier cell.
citeSeq_combined = ad.concat([refTNK, refBPl, refMyel, refStr, refEpi], join='outer')

# The combined object is written to disk in a later cell
#adata_combined.write("combined_adata.h5ad")


In [None]:
citeSeq_combined.obs

In [None]:
citeSeq_combined.var

In [None]:
print(citeSeq_combined.shape)

In [None]:
citeSeq_combined.X

In [None]:
citeSeq_combined.write("/data/vasileiosionat2/Xenium/Integration/citeSeq_combined.h5ad")


In [None]:
ref =  sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration/citeSeq_combined.h5ad')

In [None]:
adata = sc.read_h5ad('/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad')

In [None]:
adata.obs

In [None]:
# Tag the Xenium cells, then stack Xenium (adata) and citeSeq (ref) cells into one object.
# NOTE(review): concatenate(batch_key="assay", ...) itself writes an 'assay' column holding the
# batch categories, so the manual column below looks redundant — confirm before removing.
assay_data = pd.Series('Xenium', index=adata.obs.index)
adata.obs['assay'] = assay_data

# Batch categories follow the argument order: adata -> "Xenium", ref -> "citeSeq".
# Presumably the category is also appended to each cell name ('-Xenium' / '-citeSeq'), since
# the later export cells strip exactly those suffixes — confirm.
merged = adata.concatenate(
    ref, batch_key="assay", batch_categories=["Xenium", "citeSeq"]
)

In [None]:
# Scale the merged matrix, then compute the PCA embedding that the Harmony cell corrects next.
# (The original second line began with a stray '+', which raises TypeError because
# sc.tl.pca returns None when it works in place.)
sc.pp.scale(merged)
sc.tl.pca(merged)

In [None]:
sc.external.pp.harmony_integrate(merged, key="assay", max_iter_harmony=20, max_iter_kmeans=30)

In [None]:
# Visualize merged with UMAP embedding; takes a long time
sc.pp.neighbors(merged, n_neighbors=50, use_rep="X_pca_harmony", metric="correlation")
sc.tl.umap(merged, min_dist=0.5)

In [None]:
sc.pl.umap(merged, color='assay')

In [None]:
sc.pl.umap(merged, color='cluster')

In [None]:
sc.pl.umap(merged, color='Lvl3')

In [None]:
sc.pl.umap(merged, color='Lvl2')

In [None]:
merged.write("/data/vasileiosionat2/Xenium/Integration/Xenium_citeSeq_harmony.h5ad")


In [None]:
merged = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration/Xenium_citeSeq_harmony.h5ad')

In [None]:
merged.obs

In [None]:
# Transfer citeSeq 'cluster' labels onto Xenium cells: each Xenium cell takes the label of
# its single nearest citeSeq neighbour in the Harmony-corrected PCA space.
nn = KNeighborsClassifier(n_neighbors=1, n_jobs=16, weights='distance', metric='euclidean')
xenium_mask = (merged.obs["assay"] == "Xenium").to_numpy()

train = merged[merged.obs["assay"] == "citeSeq"]
nn.fit(train.obsm["X_pca_harmony"], train.obs['cluster'])
labels = nn.predict(merged.obsm["X_pca_harmony"][xenium_mask])

# Make cell names unique before they are used as the index of the new column;
# citeSeq cells are left without a value in it.
merged.obs_names_make_unique()
merged.obs["citeSeq_to_Xenium_label"] = pd.Series(labels, index=merged.obs_names[xenium_mask])

In [None]:
sc.pl.umap(merged, color='citeSeq_to_Xenium_label')

In [None]:
# Export, for every Xenium cell, its sample, original Lvl4 label and transferred citeSeq label
xenium_rows = merged.obs['assay'] == 'Xenium'
citeSeq_to_xenium_predicted_labels = merged.obs.loc[
    xenium_rows, ['orig.ident','Lvl4','citeSeq_to_Xenium_label']
]
# Drop the '-Xenium' suffix from the cell names before writing
citeSeq_to_xenium_predicted_labels.index = citeSeq_to_xenium_predicted_labels.index.str.replace('-Xenium','')
citeSeq_to_xenium_predicted_labels.to_csv('/data/vasileiosionat2/Xenium/Integration/citeSeq_to_Xenium_predicted_celltype.csv')

In [None]:
# Transfer Xenium 'Lvl4' labels onto citeSeq cells: each citeSeq cell takes the label of
# its single nearest Xenium neighbour in the Harmony-corrected PCA space.
nn = KNeighborsClassifier(n_neighbors=1, n_jobs=16, weights='distance', metric='euclidean')
citeseq_mask = (merged.obs["assay"] == "citeSeq").to_numpy()

train = merged[merged.obs["assay"] == "Xenium"]
nn.fit(train.obsm["X_pca_harmony"], train.obs['Lvl4'])
labels = nn.predict(merged.obsm["X_pca_harmony"][citeseq_mask])

# Xenium cells are left without a value in the new column
merged.obs["Xenium_to_citeSeq_label"] = pd.Series(labels, index=merged.obs_names[citeseq_mask])

In [None]:
# Export, for every citeSeq cell, its sample, original cluster and transferred Xenium label
citeseq_rows = merged.obs['assay'] == 'citeSeq'
Xenium_to_citeSeq_predicted_labels = merged.obs.loc[
    citeseq_rows, ['orig.ident','cluster','Xenium_to_citeSeq_label']
]
# Drop the '-citeSeq' suffix from the cell names before writing
Xenium_to_citeSeq_predicted_labels.index = Xenium_to_citeSeq_predicted_labels.index.str.replace('-citeSeq','')
Xenium_to_citeSeq_predicted_labels.to_csv('/data/vasileiosionat2/Xenium/Integration/Xenium_to_citeSeq_predicted_celltype.csv')

In [None]:
# final_label_citeSeq: one column in the citeSeq label vocabulary —
# Xenium cells get their transferred citeSeq label, citeSeq cells keep their own 'cluster'.
merged.obs.loc[merged.obs['assay'] == 'Xenium', 'final_label_citeSeq'] = merged.obs[merged.obs['assay'] == 'Xenium']['citeSeq_to_Xenium_label']
merged.obs.loc[merged.obs['assay'] == 'citeSeq', 'final_label_citeSeq'] = merged.obs[merged.obs['assay'] == 'citeSeq']['cluster']
# final_label_X: one column in the Xenium Lvl4 vocabulary —
# citeSeq cells get their transferred Xenium label, Xenium cells keep their own 'Lvl4'.
# Values are cast to str before assignment (safe option).
# NOTE(review): the two final_label_citeSeq assignments above are not cast the same way — confirm
# they behave as intended when the source columns are categorical.
merged.obs.loc[merged.obs['assay'] == 'citeSeq', 'final_label_X'] = \
    merged.obs.loc[merged.obs['assay'] == 'citeSeq', 'Xenium_to_citeSeq_label'].astype(str)

merged.obs.loc[merged.obs['assay'] == 'Xenium', 'final_label_X'] = \
    merged.obs.loc[merged.obs['assay'] == 'Xenium', 'Lvl4'].astype(str)


# Visualize the combined labels on the UMAP
sc.pl.umap(merged, color='final_label_citeSeq')
sc.pl.umap(merged, color='assay')
sc.pl.umap(merged, color='final_label_X')

In [None]:
merged.write("/data/vasileiosionat2/Xenium/Integration/Xenium_citeSeq_harmony.h5ad")

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Spatial panels for one Xenium sample: one subplot per B-cell / plasma-cell label
# (labels starting with "B" or "Pl"), drawn in red over all other cells in grey.
sample_id = 's2r2_HV184'
label_column = 'final_label_citeSeq'
rotation_angle = 30  # Define rotation for this sample (degrees)

# Specify the directory where you want to save the PDF
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
# Lineage-specific file names: the per-lineage cells of this notebook originally all wrote
# to 'all_clusters_4rows.pdf' and would overwrite each other.
pdf_filename = os.path.join(save_directory, f'{sample_id}_B_Plasma_clusters.pdf')
tiff_filename = os.path.join(save_directory, f'{sample_id}_B_Plasma_clusters.tiff')

# Filter the data for the specific sample
adata_sample = merged[merged.obs['sample'] == sample_id]

# Cast to str once so missing labels (NaN) cannot break the prefix tests or the comparisons
cell_labels = adata_sample.obs[label_column].astype(str)
unique_clusters = cell_labels[
    cell_labels.str.startswith("Pl") | cell_labels.str.startswith("B")
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f'No {label_column} labels with the requested prefix in sample {sample_id}')

# Grid layout: fixed number of rows, as many columns as the clusters need
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the cell centroids by rotation_angle
angle = np.deg2rad(rotation_angle)
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# NOTE(review): passing x_range / y_range to set_aspect stretches each panel towards a square
# box instead of keeping tissue proportions ('equal' would) — kept as in the original; confirm.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

with PdfPages(pdf_filename) as pdf:
    # squeeze=False always yields a 2-D axes array, so flatten() works for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for ax, cluster in zip(axes, ordered_clusters):
        in_cluster = cell_labels == cluster

        # White panel without an outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Grey background: every cell that is not in the current cluster
        ax.scatter(x=new_x_coords[~in_cluster], y=new_y_coords[~in_cluster], c='#C0C0C0', s=3)
        # Red foreground: cells of the current cluster
        ax.scatter(x=new_x_coords[in_cluster], y=new_y_coords[in_cluster], c='red', s=9)

        ax.set_aspect(aspect_ratio)

        # Cluster name at the bottom of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # No grid, ticks or tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the panels that have no cluster assigned
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.1, hspace=0.05)

    # Black title: the original white text was invisible on the white figure background
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the page into the PDF; without this call the file is created empty.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the saved page.
    pdf.savefig(fig, bbox_inches='tight')

    # Optional TIFF export
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Spatial panels for one Xenium sample: one subplot per label starting with "CD",
# drawn in red over all other cells in grey.
sample_id = 's2r2_HV184'
label_column = 'final_label_citeSeq'
rotation_angle = 30  # Define rotation for this sample (degrees)

# Specify the directory where you want to save the PDF
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
# Lineage-specific file names: the per-lineage cells of this notebook originally all wrote
# to 'all_clusters_4rows.pdf' and would overwrite each other.
pdf_filename = os.path.join(save_directory, f'{sample_id}_CD_clusters.pdf')
tiff_filename = os.path.join(save_directory, f'{sample_id}_CD_clusters.tiff')

# Filter the data for the specific sample
adata_sample = merged[merged.obs['sample'] == sample_id]

# Cast to str once so missing labels (NaN) cannot break the prefix test or the comparisons
cell_labels = adata_sample.obs[label_column].astype(str)
unique_clusters = cell_labels[cell_labels.str.startswith("CD")].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f'No {label_column} labels with the requested prefix in sample {sample_id}')

# Grid layout: fixed number of rows, as many columns as the clusters need
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the cell centroids by rotation_angle
angle = np.deg2rad(rotation_angle)
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# NOTE(review): passing x_range / y_range to set_aspect stretches each panel towards a square
# box instead of keeping tissue proportions ('equal' would) — kept as in the original; confirm.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

with PdfPages(pdf_filename) as pdf:
    # squeeze=False always yields a 2-D axes array, so flatten() works for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for ax, cluster in zip(axes, ordered_clusters):
        in_cluster = cell_labels == cluster

        # White panel without an outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Grey background: every cell that is not in the current cluster
        ax.scatter(x=new_x_coords[~in_cluster], y=new_y_coords[~in_cluster], c='#C0C0C0', s=3)
        # Red foreground: cells of the current cluster
        ax.scatter(x=new_x_coords[in_cluster], y=new_y_coords[in_cluster], c='red', s=9)

        ax.set_aspect(aspect_ratio)

        # Cluster name at the bottom of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # No grid, ticks or tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the panels that have no cluster assigned
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.1, hspace=0.05)

    # Black title: the original white text was invisible on the white figure background
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the page into the PDF; without this call the file is created empty.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the saved page.
    pdf.savefig(fig, bbox_inches='tight')

    # Optional TIFF export
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Spatial panels for one Xenium sample: one subplot per label starting with "Fib",
# drawn in red over all other cells in grey.
sample_id = 's2r2_HV184'
label_column = 'final_label_citeSeq'
rotation_angle = 30  # Define rotation for this sample (degrees)

# Specify the directory where you want to save the PDF
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
# Lineage-specific file names: the per-lineage cells of this notebook originally all wrote
# to 'all_clusters_4rows.pdf' and would overwrite each other.
pdf_filename = os.path.join(save_directory, f'{sample_id}_Fib_clusters.pdf')
tiff_filename = os.path.join(save_directory, f'{sample_id}_Fib_clusters.tiff')

# Filter the data for the specific sample
adata_sample = merged[merged.obs['sample'] == sample_id]

# Cast to str once so missing labels (NaN) cannot break the prefix test or the comparisons
cell_labels = adata_sample.obs[label_column].astype(str)
unique_clusters = cell_labels[cell_labels.str.startswith("Fib")].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f'No {label_column} labels with the requested prefix in sample {sample_id}')

# Grid layout: fixed number of rows, as many columns as the clusters need
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the cell centroids by rotation_angle
angle = np.deg2rad(rotation_angle)
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# NOTE(review): passing x_range / y_range to set_aspect stretches each panel towards a square
# box instead of keeping tissue proportions ('equal' would) — kept as in the original; confirm.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

with PdfPages(pdf_filename) as pdf:
    # squeeze=False always yields a 2-D axes array, so flatten() works for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for ax, cluster in zip(axes, ordered_clusters):
        in_cluster = cell_labels == cluster

        # White panel without an outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Grey background: every cell that is not in the current cluster
        ax.scatter(x=new_x_coords[~in_cluster], y=new_y_coords[~in_cluster], c='#C0C0C0', s=3)
        # Red foreground: cells of the current cluster
        ax.scatter(x=new_x_coords[in_cluster], y=new_y_coords[in_cluster], c='red', s=9)

        ax.set_aspect(aspect_ratio)

        # Cluster name at the bottom of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # No grid, ticks or tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the panels that have no cluster assigned
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.1, hspace=0.05)

    # Black title: the original white text was invisible on the white figure background
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the page into the PDF; without this call the file is created empty.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the saved page.
    pdf.savefig(fig, bbox_inches='tight')

    # Optional TIFF export
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Spatial panels for one Xenium sample: one subplot per label starting with "Epi",
# drawn in red over all other cells in grey.
sample_id = 's2r2_HV184'
label_column = 'final_label_citeSeq'
rotation_angle = 30  # Define rotation for this sample (degrees)

# Specify the directory where you want to save the PDF
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
# Lineage-specific file names: the per-lineage cells of this notebook originally all wrote
# to 'all_clusters_4rows.pdf' and would overwrite each other.
pdf_filename = os.path.join(save_directory, f'{sample_id}_Epi_clusters.pdf')
tiff_filename = os.path.join(save_directory, f'{sample_id}_Epi_clusters.tiff')

# Filter the data for the specific sample
adata_sample = merged[merged.obs['sample'] == sample_id]

# Cast to str once so missing labels (NaN) cannot break the prefix test or the comparisons
cell_labels = adata_sample.obs[label_column].astype(str)
unique_clusters = cell_labels[cell_labels.str.startswith("Epi")].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f'No {label_column} labels with the requested prefix in sample {sample_id}')

# Grid layout: fixed number of rows, as many columns as the clusters need
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the cell centroids by rotation_angle
angle = np.deg2rad(rotation_angle)
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# NOTE(review): passing x_range / y_range to set_aspect stretches each panel towards a square
# box instead of keeping tissue proportions ('equal' would) — kept as in the original; confirm.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

with PdfPages(pdf_filename) as pdf:
    # squeeze=False always yields a 2-D axes array, so flatten() works for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for ax, cluster in zip(axes, ordered_clusters):
        in_cluster = cell_labels == cluster

        # White panel without an outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Grey background: every cell that is not in the current cluster
        ax.scatter(x=new_x_coords[~in_cluster], y=new_y_coords[~in_cluster], c='#C0C0C0', s=3)
        # Red foreground: cells of the current cluster
        ax.scatter(x=new_x_coords[in_cluster], y=new_y_coords[in_cluster], c='red', s=9)

        ax.set_aspect(aspect_ratio)

        # Cluster name at the bottom of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # No grid, ticks or tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the panels that have no cluster assigned
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.1, hspace=0.05)

    # Black title: the original white text was invisible on the white figure background
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the page into the PDF; without this call the file is created empty.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the saved page.
    pdf.savefig(fig, bbox_inches='tight')

    # Optional TIFF export
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of how each cell cluster is distributed across spatial niches.
# Work on an explicit copy of the cell metadata so the dtype change below stays
# local to this cell.
# NOTE(review): pd.DataFrame(merged.obs) can share storage with merged.obs
# depending on pandas version / copy-on-write settings — confirm nothing
# downstream relied on merged.obs['Lvl5'] being converted here.
df = merged.obs.copy()
df['Lvl5'] = df['Lvl5'].astype('category')

# Aggregate counts: rows are clusters, columns are niches
niche_groups = df.groupby(['final_label_citeSeq', 'niche_cc14']).size().unstack(fill_value=0)

# Identify cells whose cluster label marks a mixed or neuronal population.
# na=False keeps the mask strictly boolean when a label is missing; a mask
# containing NaN cannot be used with .loc.
cluster_labels = df['final_label_citeSeq']
clusters_to_remove = (
    cluster_labels.str.contains('Mix', na=False)
    | cluster_labels.str.startswith('Neuro', na=False)
)

# Get the unique clusters to be removed
clusters_to_remove = df.loc[clusters_to_remove, 'final_label_citeSeq'].unique()

# Filter out these clusters from the plot
niche_groups_filtered = niche_groups[~niche_groups.index.isin(clusters_to_remove)]

# Calculate relative frequencies (each cluster row sums to 1)
niche_groups_relative = niche_groups_filtered.div(niche_groups_filtered.sum(axis=1), axis=0)

# Plot heatmap; the tall figure leaves room for one row per cluster
plt.figure(figsize=(16, 40))
sns.heatmap(
    niche_groups_relative,
    cmap='plasma',
    cbar_kws={'label': 'Relative Frequency'},
    linewidths=0.05,
    linecolor='black',
)
plt.title('Relative Cell Type Distribution in Niches')
plt.xlabel('Niche')
plt.ylabel('Cell Cluster')
plt.xticks(rotation=45, ha='right')  # Slanted labels keep niche names readable
plt.yticks(rotation=0)
plt.tight_layout()  # Ensures all elements fit within the figure area
plt.show()


In [None]:
# Compare Xenium annotations (Lvl4) with the labels transferred from citeSeq.
# .copy() materialises the subset so the obs assignment below writes to a real
# AnnData object instead of a view of `merged`.
xen_obj = merged[merged.obs['assay'] == 'Xenium'].copy()
xen_obj.obs['Lvl4'] = xen_obj.obs['Lvl4'].astype('str')

# Co-occurrence counts: rows are Xenium cell types, columns are predicted labels
celltype_counts = xen_obj.obs.groupby(['Lvl4', 'citeSeq_to_Xenium_label']).size().unstack()
celltype_counts.index.name = 'Xenium cell type'
celltype_counts.columns.name = 'predicted citeSeq cell type'

# After transposing, rows are predicted citeSeq labels and columns are Xenium types
celltype_counts = celltype_counts.T
# Row scale co-occurrence frequencies (by predicted citeSeq cell type)
celltype_counts = celltype_counts.div(celltype_counts.sum(axis=1), axis=0)
# Order the Xenium columns by the predicted label in which each one peaks
celltype_counts = celltype_counts.loc[:, celltype_counts.idxmax(axis=0).sort_values().index]
celltype_counts = celltype_counts.fillna(0)

In [None]:
# Total (absolute) weight carried by every column of the row-scaled table
column_sums = celltype_counts.abs().sum(axis=0)
print(column_sums)
# Retain only the columns whose total reaches the 0.1 cutoff
columns_to_keep = column_sums.index[(column_sums >= 0.1).to_numpy()]
print(columns_to_keep)
celltype_counts2 = celltype_counts.loc[:, columns_to_keep]
print(celltype_counts2)

In [None]:
# Heatmap of the thresholded co-occurrence table
plt.figure(figsize=(26, 14))
sns.heatmap(data=celltype_counts2, cmap='YlGnBu')

In [None]:
# Load the combined citeSeq object that serves as the expression reference below
ref =  sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration/citeSeq_combined.h5ad')

In [None]:
!pip install --user scikit-misc

In [None]:
# Flag the top 2000 highly variable genes in the citeSeq reference (writes ref.var['highly_variable'])
# NOTE(review): the 'seurat_v3' flavor is documented as expecting raw counts — confirm ref.X holds counts
sc.pp.highly_variable_genes(ref, flavor='seurat_v3', n_top_genes=2000)

In [None]:
# Genes of interest: highly variable genes of the reference plus every gene in adata.var
# NOTE(review): presumably `adata` is the Xenium object at this point in the notebook — verify
genes = ref.var.index[ref.var.highly_variable].union(adata.var.index)

In [None]:
# Inspect the combined gene list
print(genes)

In [None]:
# Keep only the genes of interest that are present in the citeSeq reference.
# Membership is tested against a set for speed, but the result follows the
# order of `genes` so the column order of everything derived from it is
# reproducible between kernel restarts (set iteration order is not).
ref_gene_set = set(ref.var_names)
valid_genes = list(dict.fromkeys(g for g in genes if g in ref_gene_set))

In [None]:
# Expression matrix (cells x genes) of the citeSeq reference for the selected genes
citeSeq_exp = ref[:, valid_genes].to_df()
# Drop genes without any counts. The filter must read citeSeq_exp itself; the
# previous version indexed `sc_exp`, which is not created in this cell.
citeSeq_exp = citeSeq_exp.loc[:, (citeSeq_exp != 0).any(axis=0)]

In [None]:
# Transfer citeSeq expression to Xenium: every Xenium cell receives the
# expression profile of its single nearest citeSeq neighbour in Harmony space.
is_citeSeq_cell = (merged.obs["assay"] == "citeSeq").to_numpy()
is_Xenium_cell = (merged.obs["assay"] == "Xenium").to_numpy()
harmony_embedding = merged.obsm["X_pca_harmony"]

# The neighbour indices returned below are row positions within the citeSeq
# subset of `merged`; they are only meaningful for citeSeq_exp if both hold the
# same cells in the same order.
# NOTE(review): only the row count is checked here — confirm the ordering too.
if len(citeSeq_exp) != int(is_citeSeq_cell.sum()):
    raise ValueError(
        "citeSeq_exp rows do not match the number of citeSeq cells in merged"
    )

nn = NearestNeighbors(n_jobs=16)
nn.fit(harmony_embedding[is_citeSeq_cell])
dist, inds = nn.kneighbors(harmony_embedding[is_Xenium_cell], n_neighbors=1)

# Values and column labels both come from citeSeq_exp so their shapes agree
predicted_Xenium_exp = pd.DataFrame(citeSeq_exp.iloc[inds.flatten()].to_numpy(),
                                     index=merged.obs.index[is_Xenium_cell],
                                     columns=citeSeq_exp.columns)

In [None]:
# Save the citeSeq expression transferred onto Xenium cells to a CSV file
# (rows are Xenium cells, columns are genes)
predicted_Xenium_exp.to_csv('/data/vasileiosionat2/Xenium/Integration/citeSeq_xen_results.csv')

In [None]:
# Read the transferred expression CSV written above back in as a Scanpy (AnnData) object
adata_citeSeq_Xen = sc.read_csv('/data/vasileiosionat2/Xenium/Integration/citeSeq_xen_results.csv')

In [None]:
# Add spatial coordinates and metadata from the original xenium object.
# The file is read once and reused instead of being loaded from disk twice.
# NOTE(review): both assignments are positional — this assumes the cells in
# adata_citeSeq_Xen are exactly the cells of ccProcessed.h5ad, in the same order.
xenium_source = sc.read('/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad')
adata_citeSeq_Xen.obsm['spatial'] = xenium_source.obsm['spatial']
# .copy() keeps the metadata table independent of xenium_source
adata_citeSeq_Xen.obs = xenium_source.obs.copy()

In [None]:
# Save the Xenium object carrying transferred citeSeq expression as an h5ad file
adata_citeSeq_Xen.write('/data/vasileiosionat2/Xenium/Drake_outputs/Xen_with_citeSeq_Exp.h5ad')

In [None]:
# Create one empty entry per sample under uns['spatial'] (a separate dict for each sample)
# NOTE(review): presumably needed so squidpy can resolve library ids without images — confirm
adata_citeSeq_Xen.uns['spatial'] = {s: {} for s in adata_citeSeq_Xen.obs['sample'].unique()}

In [None]:
# Cell types to keep: the column labels of the thresholded co-occurrence table
celltype_list = celltype_counts2.columns

# Per-cell type labels, read from the 'Lvl4' column of adata.obs
# NOTE(review): the mask is built from `adata` but applied to adata_citeSeq_Xen
# in the next cell — this assumes both objects hold the same cells in the same order
cell_types = adata.obs['Lvl4']

# Boolean mask: True for cells whose Lvl4 label is in celltype_list
celltype_filter = cell_types.isin(celltype_list)
print(celltype_filter)


In [None]:
# Subset the imputed Xenium object to the cell types that passed the threshold.
adata_citeSeq_Xen_f = adata_citeSeq_Xen[celltype_filter]
# Re-check against the object's own Lvl4 labels. The result of this filter was
# previously discarded, which made the line a no-op. .copy() turns the view
# into a standalone object so later cells can add obs columns to it safely.
adata_citeSeq_Xen_f = adata_citeSeq_Xen_f[
    adata_citeSeq_Xen_f.obs['Lvl4'].isin(columns_to_keep)
].copy()
adata_citeSeq_Xen_f

In [None]:
# The column order of the thresholded table defines the order of the Lvl4 groups
celltype_order = celltype_counts2.columns
print(celltype_order)

# Store Lvl4 as an ordered categorical that follows celltype_order
ordered_lvl4 = pd.Categorical(
    adata_citeSeq_Xen_f.obs['Lvl4'],
    categories=celltype_order,
    ordered=True,
)
adata_citeSeq_Xen_f.obs["Lvl4_ord"] = ordered_lvl4

In [None]:
# Rank marker genes for each Lvl4_ord group using the Wilcoxon method
sc.tl.rank_genes_groups(adata_citeSeq_Xen_f, groupby="Lvl4_ord", method="wilcoxon")

In [None]:
# Dot plot of the top 4 ranked genes per Lvl4_ord group; standard_scale="var"
# rescales each gene across groups, and the dendrogram is switched off
sc.pl.rank_genes_groups_dotplot(
  adata_citeSeq_Xen_f,
  groupby="Lvl4_ord",
  standard_scale="var",
  n_genes=4,
  dendrogram=False
)

In [None]:
import squidpy as sq

# Side-by-side check of one gene in one sample: imputed (citeSeq-transferred)
# expression versus the expression held in `adata`.
# The titles are derived from the plotted gene; they previously said "SAA1"/"SAA"
# although CCL19 was the gene being drawn.
plot_gene = 'CCL19'
plot_library_id = adata.obs['sample'].unique()[9]  # 10th sample in order of appearance

# Arguments shared by both panels so the two plots stay comparable
scatter_kwargs = dict(
    color=[plot_gene],
    library_key='sample',
    size=5,
    img=None,
    spatial_key='spatial',
    palette='tab10',
    figsize=(10, 10),
    ncols=1,
    library_id=plot_library_id,
)

# Imputed expression; the colour scale is capped at 10 for this panel only
sq.pl.spatial_scatter(
    adata_citeSeq_Xen_f,
    title=f"{plot_gene} - Imputed",
    vmax=10,
    **scatter_kwargs
)
# Expression stored in `adata`
sq.pl.spatial_scatter(
    adata,
    title=f"{plot_gene} - Xenium",
    **scatter_kwargs
)