In [14]:
import anndata
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc

import warnings
# Silence every Python warning for the rest of the session. This is a blanket
# filter, so warnings raised by anndata/pandas/scanpy in later cells are hidden too.
warnings.filterwarnings("ignore")
# Lowest scanpy logging level.
sc.settings.verbosity = 0

In [15]:
# Specify data, setting, and paths
# One entry per sample: the (x_shift, y_shift) offset that the merge cell adds to
# that sample's "global_x" / "global_y" cell coordinates.
settings = {
    sample: {"x_shift": dx, "y_shift": dy}
    for sample, dx, dy in [
        # (sample, x_shift, y_shift)
        ("Xenium_5K_BC", 0, 7000),
        ("Xenium_5K_OC", 12000, 10000),
        ("Xenium_5K_CC", 26000, 8000),
        ("Xenium_5K_LC", 12000, 0),
        ("Xenium_5K_Prostate", 26000, 1000),
        ("Xenium_5K_Skin", 0, 1000),
    ]
}

# Figures and tables are written under output_dir; the merged AnnData under save_dir.
output_dir = "../../output/merged_data/"
save_dir = "../../data/merged_data/"

In [16]:
# Construct merged data
#
# For every sample in `settings`:
#   1. load the per-sample AnnData and drop its `.uns`,
#   2. add the sample's (x_shift, y_shift) to the cell coordinates
#      (presumably so all samples can be laid out in one shared plot),
#   3. keep the transcripts whose cell_id belongs to a cell annotated
#      "Malignant cell".
# The shifted per-sample AnnData objects (all cell types) are then stacked
# along the obs axis into a single `adata`.
transcripts_dict = {}  # sample name -> transcripts located in tumor cells
adata_dict = {}        # sample name -> shifted AnnData with all cells

for sample_name, shifts in settings.items():

    # Read data
    sample_dir = f"../../data/{sample_name}/"
    adata_sample = sc.read_h5ad(sample_dir + "intermediate_data/adata.h5ad")
    print(f"Number of genes: {adata_sample.n_vars}")

    # Delete uns
    del adata_sample.uns

    # Shift coordinates
    adata_sample.obs["global_x"] = adata_sample.obs["global_x"] + shifts["x_shift"]
    adata_sample.obs["global_y"] = adata_sample.obs["global_y"] + shifts["y_shift"]

    # Select tumor cells. Only their IDs are needed below, so they are read
    # straight from `.obs` instead of materialising a copy of the AnnData subset.
    is_tumor = adata_sample.obs["cell_type_merged"] == "Malignant cell"
    tumor_cell_ids = adata_sample.obs.loc[is_tumor, "cell_id"]

    # Read transcripts and select those within tumor cells
    transcripts = pd.read_parquet(sample_dir + "processed_data/transcripts.parquet")
    transcripts_tumor = transcripts[transcripts["cell_id"].isin(tumor_cell_ids)]
    print(f"{transcripts_tumor.shape[0]} out of {transcripts.shape[0]} transcripts in tumor cells.")

    # Store data
    adata_dict[sample_name] = adata_sample
    transcripts_dict[sample_name] = transcripts_tumor

# Stack the samples along obs; the dict keys become the values of obs["batch"].
# merge="same" keeps only those var annotations that are identical in every sample.
# NOTE(review): index_unique is not passed, so the original obs names are kept
# as-is. Confirm they cannot collide across samples.
adata = anndata.concat(adata_dict, axis = 0, merge = "same", label = "batch")
adata

Number of genes: 5001
35641064 out of 87146596 transcripts in tumor cells.
Number of genes: 5001
81798615 out of 129043380 transcripts in tumor cells.
Number of genes: 5001
43489382 out of 90482395 transcripts in tumor cells.
Number of genes: 5001
71378586 out of 176649914 transcripts in tumor cells.
Number of genes: 5001
32109068 out of 59566887 transcripts in tumor cells.
Number of genes: 5001
37837245 out of 73302888 transcripts in tumor cells.


AnnData object with n_obs × n_vars = 1903003 × 5001
    obs: 'cell_id', 'global_x', 'global_y', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'cell_type_merged', 'batch'
    var: 'gene_ids', 'feature_types', 'genome', 'gene'

In [17]:
# Overall in-cytoplasm ratio in tumor cells
# Pool the tumor-cell transcripts of all samples, then for each target take the
# mean of "overlaps_nucleus" as the in-nucleus ratio; the in-cytoplasm ratio is
# its complement. Rows are ordered by ascending in-nucleus ratio.
transcripts_all = pd.concat(transcripts_dict.values(), axis=0)
gene_means = (
    transcripts_all
    .groupby("target")["overlaps_nucleus"]
    .mean()
    .reset_index()
    .rename(columns={"target": "gene", "overlaps_nucleus": "in_nucleus_ratio"})
    .sort_values(by="in_nucleus_ratio", ascending=True)
)
gene_means["in_cytoplasm_ratio"] = 1 - gene_means["in_nucleus_ratio"]
gene_means.to_csv(output_dir + "in_cytoplasm_ratio_tumor.csv", index=False)

In [12]:
# Determine plot size
# Scale the coordinate extents so that the shorter edge maps to 10 (figure-size
# units); each edge is then truncated to a whole number by int().
coords = adata.obs[["global_x", "global_y"]]
x_range, y_range = coords.max() - coords.min()
short_edge = min(x_range, y_range)

scale = 10 / short_edge
plot_figsize = tuple(int(extent * scale) for extent in (x_range, y_range))
print(f"Plot size: {plot_figsize}")

Plot size: (16, 10)


In [13]:
def save_spatial_scatter(adata, color, out_path, point_size=0.5, dpi=300):
    """Scatter cells at (global_x, global_y) coloured by an obs column and save the image.

    The axes are stripped of grid, ticks, axis labels, title and spines so that
    only the points (and whatever legend scanpy draws) remain.

    Parameters
    ----------
    adata
        AnnData whose ``obs`` holds "global_x", "global_y" and ``color``.
    color
        Name of the ``obs`` column used to colour the points.
    out_path
        Destination file; the image format follows the file extension.
    point_size
        Marker size forwarded to ``sc.pl.scatter``.
    dpi
        Resolution of the saved image.
    """
    ax = sc.pl.scatter(adata, x="global_x", y="global_y", color=color, size=point_size, show=False)
    ax.grid(False)
    ax.set(xticks=[], yticks=[], xlabel="", ylabel="", title="")
    for spine in ax.spines.values():
        spine.set_visible(False)
    # Save and close through the Axes' own figure rather than pyplot's implicit
    # "current figure", so the right figure is always written and released.
    fig = ax.figure
    fig.savefig(out_path, dpi=dpi, bbox_inches="tight")
    plt.close(fig)


# Plot batch and cell type: one image per obs column, same layout for both.
sc.set_figure_params(figsize=plot_figsize)
plot_targets = {
    "batch": "batch.jpeg",
    "cell_type_merged": "cell_type_merged.jpeg",
}
for obs_key, file_name in plot_targets.items():
    save_spatial_scatter(adata, obs_key, output_dir + file_name)

In [14]:
# Persist the merged AnnData (all samples, all cell types) to save_dir.
merged_adata_path = save_dir + "adata_all_raw.h5ad"
adata.write_h5ad(merged_adata_path)