In [None]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the integrated dataset (AnnData stored as .h5ad). The path is a placeholder
# and must be pointed at the real file before running.
integrated_h5ad_path = "path_to_your_integrated_dataset.h5ad"
adata_combined_barrier_Epi = sc.read_h5ad(integrated_h5ad_path)

In [None]:
# Figure 4D: dot plot of selected genes across the chosen tissues.

# Font setup: Arial, with font type 42 (TrueType) so exported PDF/PS files keep
# text as editable text rather than outlines.
plt.rcParams.update({
    'font.family': 'Arial',
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Tissues to include; this list also fixes the order in which they are plotted.
clusters_of_interest = [
    'Skin',
    'Large_Intestine',
    'Small_Intestine',
    'Lung',
    'Tongue',
    'Gingiva-OE',
    'Gingiva-TAE',
]

# Genes to plot, in display order.
correct_order = [
    'DCD', 'SEC11C', 'CEACAM5', 'MUC12', 'KRT20', 'PIGR', 'MUC13',
    'MUC17', 'MME', 'SFTPB',
    'IFITM3', 'SLPI', 'CXCL17', 'KRT13',
    'KRT5', 'KRT6A', 'S100A8', 'S100A9', 'CXCL14', 'KRT4', 'ODAM', 'MMP12', 'FDCSP', 'SAA1',
    'SAA2', 'DNASE1L3', 'CXCL1', 'CXCL8',
]

# Keep only cells from the tissues of interest; copy so the subset is an
# independent object and the full dataset is left untouched.
tissue_mask = adata_combined_barrier_Epi.obs['tissue_in_publication'].isin(clusters_of_interest)
adata_subset = adata_combined_barrier_Epi[tissue_mask].copy()

# Re-cast the grouping column as an ordered categorical so the groups follow
# the order of `clusters_of_interest`.
tissue_dtype = pd.CategoricalDtype(categories=clusters_of_interest, ordered=True)
adata_subset.obs['tissue_in_publication'] = adata_subset.obs['tissue_in_publication'].astype(tissue_dtype)

# Build and style the dot plot in one fluent chain.
dotplot = (
    sc.pl.DotPlot(
        adata_subset,
        var_names=correct_order,            # genes on the x-axis
        groupby='tissue_in_publication',    # tissues on the y-axis
        standard_scale='var',               # scale each gene across groups
        vmin=0.1,                           # lower limit of the colour scale
        vmax=0.8,                           # upper limit of the colour scale
        figsize=(9, 4),
    )
    # swap_axes=False: the axes are NOT transposed (genes stay on x, tissues on y).
    .swap_axes(swap_axes=False)
    # Grayscale look without dot outlines or grid lines.
    .style(
        cmap="Greys",
        dot_edge_color=None,
        dot_edge_lw=0,
        grid=False,
        dot_min=0.1,    # lower bound passed to the dot-size scale
        dot_max=0.8,    # upper bound passed to the dot-size scale
    )
)

# Render the figure
dotplot.show()

# Output file for the figure (placeholder; the extension makes this a PDF).
# Uncomment the savefig line to write it with a transparent background.
output_path = 'path_to_output_dir.pdf'
#dotplot.savefig(output_path, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# Figure 4E: define the top marker genes per tissue; the resulting lists are
# exported to Excel and subsequently imported into Enrichr.

# Step 1: Ensure all observation names are unique (modifies the full dataset in place)
adata_combined_barrier_Epi.obs_names_make_unique()

# Subset the data to the tissues compared in this figure. `.copy()` turns the
# AnnData view into an independent object, so the writes below (obs column,
# log1p, rank_genes_groups results) never touch a view of the full dataset.
tissues_for_de = [
    'Lung',
    'Tongue',
    'Skin',
    'Large_Intestine',
    'Small_Intestine',
    'Gingiva-OE',
    'Gingiva-TAE',
]
adata_subset = adata_combined_barrier_Epi[
    adata_combined_barrier_Epi.obs['tissue_in_publication'].isin(tissues_for_de)
].copy()

# Ensure 'tissue_in_publication' is categorical and carries only the tissues
# actually present, so every category looped over in Step 4 has a result column.
adata_subset.obs['tissue_in_publication'] = (
    adata_subset.obs['tissue_in_publication']
    .astype('category')
    .cat.remove_unused_categories()
)

# Step 2: Log-transform the data only if not already done. scanpy's log1p
# records itself under .uns['log1p'], so that key is used as the marker;
# transforming a second time would distort expression values and fold changes.
if 'log1p' in adata_subset.uns:
    # NOTE(review): an h5ad round-trip can drop the `base: None` entry, and some
    # scanpy releases appear to index uns['log1p']['base'] directly inside
    # rank_genes_groups; restore the key defensively. Confirm against the
    # installed scanpy version.
    adata_subset.uns['log1p'].setdefault('base', None)
else:
    sc.pp.log1p(adata_subset)

# Step 3: Perform differential expression analysis (each tissue vs. the rest)
# using the Wilcoxon method on .X rather than .raw
sc.tl.rank_genes_groups(adata_subset, 'tissue_in_publication', method='wilcoxon', use_raw=False)

# Step 4: Extract the top marker genes for each tissue
n_top_genes = 30
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:n_top_genes]
    for cluster in adata_subset.obs['tissue_in_publication'].cat.categories
}

# One column per tissue, one row per rank (top gene first)
df_top_genes = pd.DataFrame(top_genes_per_cluster)

# Save to Excel (placeholder path)
df_top_genes.to_excel("path_to_output_dir.xlsx", index=False)