In [26]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used in later cells.
warnings.filterwarnings("ignore")
# Set scanpy's logging verbosity to 0.
sc.settings.verbosity = 0

In [27]:
# Per-dataset plotting configuration: which coordinate columns to plot
# (in order) and the figure size to use.
settings = {
    "Xenium_5K_BC": dict(coords=["global_x", "global_y"], figsize=(5, 8)),
    "Xenium_5K_OC": dict(coords=["global_y", "global_x"], figsize=(5, 7)),
}

# Dataset processed by this notebook; must be one of the keys of `settings`.
data = "Xenium_5K_OC"
plot_coords = settings[data]["coords"]
plot_figsize = settings[data]["figsize"]

# Input and output locations (relative paths).
utils_dir = "../../data/utils/"
data_dir = f"../../data/{data}/"
output_dir = f"../../output/{data}/"

In [28]:
# Load the inputs from the dataset's processed_data folder.
# Gene list: the values of the first column of genes.csv.
genes = list(pd.read_csv(data_dir + "processed_data/genes.csv").iloc[:, 0])

# AnnData object read from adata_tumor.h5ad.
adata_tumor = sc.read_h5ad(data_dir + "processed_data/adata_tumor.h5ad")

# Transcript table; later cells read its cell_id, target and nucleus-flag columns.
transcripts = pd.read_parquet(data_dir + "processed_data/transcripts.parquet")

In [29]:
# Construct nuclear expression matrix: a sparse cells x genes matrix holding,
# for each (cell, gene) pair, the number of transcripts flagged in_nucleus == 1.
# Rows follow the order of adata_tumor.obs["cell_id"], columns the order of
# adata_tumor.var.index.
cell_ids = list(adata_tumor.obs["cell_id"])
gene_ids = list(adata_tumor.var.index)

# Label -> position lookups. get_indexer needs unique labels, so fail early
# with a clear message instead of a cryptic indexing error.
cell_index = pd.Index(cell_ids)
gene_index = pd.Index(gene_ids)
if not (cell_index.is_unique and gene_index.is_unique):
    raise ValueError("cell_ids and gene_ids must be unique to build the expression matrix")

# Keep nuclear transcripts that belong to a retained cell and a retained gene.
nuclear_df = transcripts[transcripts["in_nucleus"] == 1]
filtered_df = nuclear_df[nuclear_df["cell_id"].isin(cell_ids) & nuclear_df["target"].isin(gene_ids)]

# Build the matrix directly in sparse form: one entry of 1 per transcript.
# This avoids materialising a dense cells x genes table (several GB at this
# size) only to convert it back to sparse. Every row of filtered_df passed the
# isin filters above, so all positions are valid (no -1 from get_indexer).
row_idx = cell_index.get_indexer(filtered_df["cell_id"])
col_idx = gene_index.get_indexer(filtered_df["target"])
ones = np.ones(len(filtered_df), dtype=np.int64)
X = sparse.coo_matrix(
    (ones, (row_idx, col_idx)),
    shape=(len(cell_ids), len(gene_ids)),
).tocsr()  # COO -> CSR sums repeated (cell, gene) entries into counts

sparse.save_npz(data_dir + "processed_data/nuclear_expression_matrix.npz", X)
X.shape

(160250, 5101)

In [30]:
# Construct cytoplasmic expression matrix: a sparse cells x genes matrix holding,
# for each (cell, gene) pair, the number of transcripts flagged
# overlaps_nucleus == 0. Uses cell_ids / gene_ids defined in the nuclear-matrix
# cell, so rows and columns are ordered identically to that matrix.
# NOTE(review): the nuclear matrix filters on `in_nucleus == 1` while this one
# filters on `overlaps_nucleus == 0` -- confirm the two flags are complementary.
cyto_df = transcripts[transcripts["overlaps_nucleus"] == 0]
filtered_df = cyto_df[cyto_df["cell_id"].isin(cell_ids) & cyto_df["target"].isin(gene_ids)]

# Label -> position lookups. get_indexer needs unique labels, so fail early
# with a clear message instead of a cryptic indexing error.
cell_index = pd.Index(cell_ids)
gene_index = pd.Index(gene_ids)
if not (cell_index.is_unique and gene_index.is_unique):
    raise ValueError("cell_ids and gene_ids must be unique to build the expression matrix")

# Build the matrix directly in sparse form: one entry of 1 per transcript.
# This avoids materialising a dense cells x genes table (several GB at this
# size) only to convert it back to sparse. Every row of filtered_df passed the
# isin filters above, so all positions are valid (no -1 from get_indexer).
row_idx = cell_index.get_indexer(filtered_df["cell_id"])
col_idx = gene_index.get_indexer(filtered_df["target"])
ones = np.ones(len(filtered_df), dtype=np.int64)
X = sparse.coo_matrix(
    (ones, (row_idx, col_idx)),
    shape=(len(cell_ids), len(gene_ids)),
).tocsr()  # COO -> CSR sums repeated (cell, gene) entries into counts

sparse.save_npz(data_dir + "processed_data/cytoplasmic_expression_matrix.npz", X)
X.shape

(160250, 5101)