In [4]:
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse

# Notebook-wide noise suppression.
# NOTE(review): filterwarnings("ignore") hides *every* Python warning for the
# rest of the session, including deprecation warnings from pandas / anndata.
# Consider narrowing it to specific categories if something misbehaves silently.
import warnings
warnings.filterwarnings("ignore")
# Set scanpy's logging verbosity to its lowest level (0).
sc.settings.verbosity = 0

In [5]:
# ==================== Main operations ==================== #

# Datasets to process. The per-dataset options are not read in this cell;
# only the keys (dataset folder names) are used to drive the loop below.
settings = {"Xenium_5K_BC": {"cell_type_label": True},
            "Xenium_5K_OC": {"cell_type_label": True},
            "Xenium_5K_CC": {"cell_type_label": True},
            "Xenium_5K_LC": {"cell_type_label": False},
            "Xenium_5K_Prostate": {"cell_type_label": False},
            "Xenium_5K_Skin": {"cell_type_label": False}}


def _count_transcripts(transcript_df, cell_index, gene_index):
    """Count transcripts per (cell, gene) directly into a sparse matrix.

    Rows of ``transcript_df`` whose ``cell_id`` is not in ``cell_index`` or
    whose ``target`` is not in ``gene_index`` are ignored. Row ``i`` / column
    ``j`` of the result correspond to ``cell_index[i]`` / ``gene_index[j]``.

    Building the matrix from (row, col) positions avoids materialising a dense
    cells x genes table, which for the larger datasets would need several GB.

    Parameters
    ----------
    transcript_df : pandas.DataFrame
        One row per transcript, with at least ``cell_id`` and ``target`` columns.
    cell_index : pandas.Index
        Cell identifiers defining the row order. Must be unique.
    gene_index : pandas.Index
        Gene identifiers defining the column order. Must be unique.

    Returns
    -------
    scipy.sparse.csr_matrix
        int64 count matrix of shape ``(len(cell_index), len(gene_index))``.

    Raises
    ------
    ValueError
        If ``cell_index`` or ``gene_index`` contains duplicates, since counts
        could not be attributed to a single row/column.
    """
    for label, index in (("cell_ids", cell_index), ("gene_ids", gene_index)):
        if not index.is_unique:
            raise ValueError(f"{label} contains duplicates; cannot build a count matrix")

    # get_indexer returns -1 for labels that are absent from the index.
    row_pos = cell_index.get_indexer(transcript_df["cell_id"])
    col_pos = gene_index.get_indexer(transcript_df["target"])
    keep = (row_pos >= 0) & (col_pos >= 0)

    ones = np.ones(int(keep.sum()), dtype = np.int64)
    counts = sparse.coo_matrix((ones, (row_pos[keep], col_pos[keep])),
                               shape = (len(cell_index), len(gene_index)))
    # Converting COO -> CSR sums repeated (row, col) entries, i.e. produces the counts.
    return counts.tocsr()


for data in settings:

    print(f"========== Processing {data}... ==========")

    # paths
    data_dir = f"../../data/{data}/"
    utils_dir = "../../data/utils/"      # not used in this cell
    output_dir = f"../../output/{data}/"  # not used in this cell

    # Read data
    # NOTE(review): `genes` is not used below in this cell; kept in case later cells rely on it.
    genes = pd.read_csv(data_dir + "processed_data/genes.csv")
    genes = list(genes.iloc[:, 0])

    adata = sc.read_h5ad(data_dir + "intermediate_data/adata.h5ad")
    adata_tumor = adata[adata.obs["cell_type_merged"] == "Malignant cell"].copy()
    transcripts = pd.read_parquet(data_dir + "processed_data/transcripts.parquet")

    # Row / column order shared by both matrices: malignant cells x genes in adata.
    cell_ids = list(adata_tumor.obs["cell_id"])
    gene_ids = list(adata_tumor.var.index)
    cell_index = pd.Index(cell_ids)
    gene_index = pd.Index(gene_ids)

    # NOTE(review): the nuclear matrix is selected with `in_nucleus == 1` while the
    # cytoplasmic one uses `overlaps_nucleus == 0`. Transcripts satisfying neither
    # condition end up in neither matrix -- confirm that this is intended.
    compartments = (
        ("nuclear", transcripts["in_nucleus"] == 1),
        ("cytoplasmic", transcripts["overlaps_nucleus"] == 0),
    )

    for compartment, mask in compartments:
        # Construct and save the expression matrix for this compartment
        X = _count_transcripts(transcripts[mask], cell_index, gene_index)
        sparse.save_npz(data_dir + f"processed_data/{compartment}_expression_matrix.npz", X)
        print(f"Shape of the {compartment} expression matrix: {X.shape}")

    # Save cell_ids and gene_ids for future checking
    np.save(data_dir + "processed_data/cell_ids.npy", np.array(cell_ids, dtype = object))
    np.save(data_dir + "processed_data/gene_ids.npy", np.array(gene_ids, dtype = object))

Shape of the nuclear expression matrix: (102180, 5101)
Shape of the cytoplasmic expression matrix: (102180, 5101)
Shape of the nuclear expression matrix: (160250, 5101)
Shape of the cytoplasmic expression matrix: (160250, 5101)
Shape of the nuclear expression matrix: (221355, 5101)
Shape of the cytoplasmic expression matrix: (221355, 5101)
Shape of the nuclear expression matrix: (44624, 5001)
Shape of the cytoplasmic expression matrix: (44624, 5001)
Shape of the nuclear expression matrix: (95429, 5006)
Shape of the cytoplasmic expression matrix: (95429, 5006)
Shape of the nuclear expression matrix: (49126, 5006)
Shape of the cytoplasmic expression matrix: (49126, 5006)
