In [2]:
import anndata
import cv2
import io
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import scanpy as sc
import shutil
import tarfile
import tifffile
from mcDETECT.utils import *

import warnings
warnings.filterwarnings("ignore")
sc.settings.verbosity = 0

In [3]:
# Load the gene panel shared across datasets.
# `gene_panel` keeps the stored order (it is used to index count-matrix columns),
# `gene_panel_set` gives fast membership tests when filtering transcripts.
utils_dir = "../../data/_utils/"
panel_array = np.load(utils_dir + "shared_genes.npy")
gene_panel = panel_array.astype(str).tolist()
gene_panel_set = set(gene_panel)

In [4]:
# ==================== Main operations ==================== #

# Datasets to process, grouped by whether a cell type label file is read for them
# (True) or cell types are assigned manually from cluster labels (False).
_datasets_by_label_availability = (
    (True, ("Xenium_5K_BC", "Xenium_5K_OC", "Xenium_5K_CC")),
    (False, ("Xenium_5K_LC", "Xenium_5K_Prostate", "Xenium_5K_Skin")),
)
settings = {
    name: {"cell_type_label": has_labels}
    for has_labels, names in _datasets_by_label_availability
    for name in names
}

# obs columns used as the x / y axes of the spatial scatter plots
plot_coords = ["global_x", "global_y"]

# When True, datasets without labels only get per-cluster marker gene tables
# written and the rest of their processing is skipped for that run.
DE_analysis = False

for data in settings.keys():

    print(f"========== Processing {data}... ==========")

    # settings
    available_cell_type_label = settings[data]["cell_type_label"]

    # paths
    data_dir = f"../../data/{data}/"
    output_dir = f"../../output/{data}/"

    # Create the folders this section writes to, so the run does not fail on a
    # fresh checkout where they do not exist yet
    os.makedirs(data_dir + "processed_data/", exist_ok = True)
    os.makedirs(output_dir, exist_ok = True)

    # ==================== Transcripts ==================== #

    # Read transcripts and keep only the rows where is_gene is True
    transcripts = pd.read_parquet(data_dir + "raw_data/transcripts.parquet")
    transcripts = transcripts[transcripts["is_gene"] == True]

    # Process transcripts: keep the columns needed downstream and rename them;
    # note the raw "overlaps_nucleus" flag is carried forward as "in_nucleus"
    transcripts = transcripts[["cell_id", "overlaps_nucleus", "feature_name", "x_location", "y_location", "z_location"]]
    transcripts = transcripts.rename(columns = {"overlaps_nucleus": "in_nucleus", "feature_name": "target", "x_location": "global_x", "y_location": "global_y", "z_location": "global_z"})

    # Keep only shared genes
    transcripts = transcripts[transcripts["target"].isin(gene_panel_set)].copy()
    present = set(transcripts["target"].unique())
    print(f"Unique genes in the transcripts after filtering: {len(present)} / {len(gene_panel)}")

    # Relative position to cell and nucleus.
    # "in_cell" is 1 when the transcript has a cell_id other than "UNASSIGNED".
    # The recomputed "overlaps_nucleus" is 1 when in_nucleus and in_cell agree and
    # 0 when they differ; the summary below reports the 0 case as in-cytoplasm.
    transcripts["in_cell"] = (transcripts["cell_id"] != "UNASSIGNED").astype(int)
    transcripts["overlaps_nucleus"] = (transcripts["in_nucleus"] == transcripts["in_cell"]).astype(int)

    # Save transcripts (the helper column "in_cell" is not written out)
    transcripts = transcripts[["cell_id", "in_nucleus", "overlaps_nucleus", "target", "global_x", "global_y", "global_z"]]
    transcripts.to_parquet(data_dir + "processed_data/transcripts.parquet")

    # Summary statistics
    n_transcripts = transcripts.shape[0]
    print(f"In-nucleus ratio: {100 * np.sum(transcripts['in_nucleus'] == 1) / n_transcripts:.2f}%")
    print(f"In-cytoplasm ratio: {100 * np.sum(transcripts['overlaps_nucleus'] == 0) / n_transcripts:.2f}%")

    # Gene-wise in-cytoplasm ratio.
    # NOTE: the column named "in_nucleus_ratio" is the per-gene mean of the recomputed
    # "overlaps_nucleus" flag (so it also counts transcripts where both flags are 0);
    # "in_cytoplasm_ratio" is its complement. Column names are kept for downstream readers.
    gene_means = transcripts.groupby("target")["overlaps_nucleus"].mean().reset_index()
    gene_means.columns = ["gene", "in_nucleus_ratio"]
    gene_means = gene_means.sort_values(by = "in_nucleus_ratio", ascending = True)
    gene_means["in_cytoplasm_ratio"] = 1 - gene_means["in_nucleus_ratio"]
    gene_means.to_csv(output_dir + "in_cytoplasm_ratio.csv", index = False)
    
    # ==================== Cells ==================== #

    # Per-cell metadata table; its index is cast to str before it is used as obs
    cells = pd.read_parquet(data_dir + "raw_data/cells.parquet")
    cells.index = cells.index.astype(str)

    # Count matrix, restricted to the shared gene panel (columns follow gene_panel order)
    adata_raw = sc.read_10x_h5(data_dir + "raw_data/cell_feature_matrix.h5")
    adata_raw = adata_raw[:, gene_panel].copy()

    # Rebuild the AnnData with the cell table as obs.
    # NOTE(review): rows are paired by position — assumes cells.parquet and the
    # count matrix list cells in the same order; confirm for new datasets.
    adata = anndata.AnnData(X = adata_raw.X, obs = cells)
    adata.var = adata_raw.var.copy()
    adata.var["gene"] = adata.var.index.to_list()

    # Use the same coordinate column names as the transcript table
    centroid_columns = {"x_centroid": "global_x", "y_centroid": "global_y"}
    adata.obs = adata.obs.rename(columns = centroid_columns)
    print(f"Unique genes in the count matrix after filtering: {adata.shape[1]}")
    
    # Determine plot size: rescale the extent of the cell centroids so the shorter
    # edge is about 5, then truncate both edges to integers
    obs_x = adata.obs["global_x"]
    obs_y = adata.obs["global_y"]
    x_range = obs_x.max() - obs_x.min()
    y_range = obs_y.max() - obs_y.min()
    short_edge = min(x_range, y_range)

    scale = 5 / short_edge
    plot_figsize = (int(x_range * scale), int(y_range * scale))
    print(f"Plot size: {plot_figsize}")

    # Transform coordinates to pixel.
    # The 3x3 matrix from HE_alignment.csv is inverted and applied to homogeneous
    # Xenium-pixel coordinates; the result is used below as H&E image pixel coordinates.
    M = pd.read_csv(data_dir + "raw_data/HE_alignment.csv", header = None).to_numpy(dtype = float)
    Minv = np.linalg.inv(M)

    with open(data_dir + "raw_data/experiment.xenium", "r", encoding="utf-8") as handle:
        experiment = json.load(handle)
    um_per_xenium_pixel = experiment["pixel_size"]

    # Centroid coordinates divided by the pixel size give Xenium pixel units
    x_xenium_pixel = obs_x.to_numpy(dtype = float) / um_per_xenium_pixel
    y_xenium_pixel = obs_y.to_numpy(dtype = float) / um_per_xenium_pixel

    # One homogeneous row [x, y, 1] per cell
    P = np.column_stack([x_xenium_pixel, y_xenium_pixel, np.ones_like(x_xenium_pixel)])
    Q = (Minv @ P.T).T

    adata.obs["x_pixel"], adata.obs["y_pixel"] = Q[:, 0], Q[:, 1]
    
    # Sanity-check the coordinate mapping on one dataset by drawing the cells
    # on top of its H&E image
    if data == "Xenium_5K_OC":

        # read image, convert RGB -> BGR for the OpenCV calls below
        img = tifffile.imread(data_dir + "raw_data/HE_image.tif")
        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        h, w = img.shape[:2]

        # cell coordinates in pixel; keep only those that fall inside the image
        x = adata.obs["x_pixel"].to_numpy(dtype=float)
        y = adata.obs["y_pixel"].to_numpy(dtype=float)
        inside_x = (x >= 0) & (x < w)
        inside_y = (y >= 0) & (y < h)
        mask = inside_x & inside_y
        x_in = x[mask].astype(np.int32)
        y_in = y[mask].astype(np.int32)

        # one filled dot per cell; (255, 0, 0) is blue in BGR order
        for center in zip(x_in, y_in):
            cv2.circle(img_bgr, center, radius=10, color=(255, 0, 0), thickness=-1)

        # save a downsampled copy whose shorter side is `shorter_target` pixels
        shorter_target = 2000
        scale = shorter_target / min(h, w)
        new_w, new_h = (int(round(side * scale)) for side in (w, h))

        img_bgr_small = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)
        cv2.imwrite(output_dir + "HE_with_cells_small.png", img_bgr_small)
    
    # Read cell type annotation
    if available_cell_type_label:
        
        print(f"Cell type labels are available for {data}. Proceeding to map cell type labels.")

        cell_type_df = pd.read_csv(data_dir + "raw_data/cell_type_labels.csv")
        cell_type_map = dict(zip(cell_type_df["cell_id"], cell_type_df["group"]))

        adata.obs["cell_type"] = adata.obs["cell_id"].map(cell_type_map)
        print("Number of cells before removing NA:", adata.shape[0])

        adata = adata[~adata.obs["cell_type"].isna()].copy()
        print("Number of cells after removing NA:", adata.shape[0])
        
        # Merge cell types
        with open(utils_dir + "cell_type_dict.pkl", "rb") as file:
            merge_dict = pickle.load(file)

        adata.obs["cell_type_merged"] = adata.obs["cell_type"].map(merge_dict)
        print("Number of cells before removing NA:", adata.shape[0])

        adata = adata[~adata.obs["cell_type_merged"].isna()].copy()
        print("Number of cells after removing NA:", adata.shape[0])
        
        # Plot cell types
        sc.set_figure_params(figsize = plot_figsize)
        ax = sc.pl.scatter(adata, x = plot_coords[0], y = plot_coords[1], color = "cell_type", size = 0.5, title = " ", show = False)
        ax.grid(False)
        plt.savefig(output_dir + "overall_cell_type.jpeg", dpi = 500, bbox_inches = "tight")
        plt.close()

    else:
        
        print(f"Cell type labels are not available for {data}. Manually assigning cell type labels.")
        
        # Read cluster labels
        with tarfile.open(data_dir + "raw_data/analysis.tar.gz", "r:gz") as tar:
            for member in tar.getmembers():
                if member.name == "analysis/clustering/gene_expression_graphclust/clusters.csv" and member.isfile():
                    f = tar.extractfile(member)
                    if f:
                        text = f.read().decode("utf-8")
                        cluster_labels = pd.read_csv(io.StringIO(text))
        
        # Match cluster labels
        num_clusters = np.max(cluster_labels["Cluster"])
        cluster_map = dict(zip(cluster_labels["Barcode"], cluster_labels["Cluster"]))
        adata.obs["cluster_labels"] = adata.obs["cell_id"].map(cluster_map)
        adata.obs["cluster_labels"] = adata.obs["cluster_labels"].fillna(num_clusters + 1)
        adata.obs["cluster_labels"] = adata.obs["cluster_labels"].astype(int).astype(str)
        
        # Plot cell types
        sc.set_figure_params(figsize = plot_figsize)
        ax = sc.pl.scatter(adata, x = plot_coords[0], y = plot_coords[1], color = "cluster_labels", size = 0.5, title = " ", show = False)
        ax.grid(False)
        plt.savefig(output_dir + "cluster_labels.jpeg", dpi = 500, bbox_inches = "tight")
        plt.close()
        
        if DE_analysis:
            
            # Marker gene path
            marker_gene_dir = output_dir + "marker_genes/"
            if os.path.exists(marker_gene_dir):
                shutil.rmtree(marker_gene_dir)
                os.makedirs(marker_gene_dir)
            else:
                os.makedirs(marker_gene_dir)
        
            # DE analysis
            adata_raw_count = adata.copy()
            sc.pp.normalize_total(adata, target_sum = 1e4)
            sc.pp.log1p(adata)
            
            sc.tl.rank_genes_groups(adata, "cluster_labels", method="wilcoxon")
            markers = pd.DataFrame(adata.uns["rank_genes_groups"]["names"]).copy()

            names = adata.uns["rank_genes_groups"]["names"]
            names = pd.DataFrame(names)
            logfc = adata.uns["rank_genes_groups"]["logfoldchanges"]
            logfc = pd.DataFrame(logfc)
            pvals = adata.uns["rank_genes_groups"]["pvals"]
            pvals = pd.DataFrame(pvals)

            for i in [str(i) for i in list(adata.obs["cluster_labels"].unique())]:
                df = {"names": names[i], "logfc": logfc[i], "pvals": pvals[i]}
                df = pd.DataFrame(df)
                df = df[df["logfc"] >= 0]
                df = df[df["pvals"] <= 0.05]
                df = df.sort_values(by = ["logfc"], ascending = False)
                file = marker_gene_dir + i + ".csv"
                df.to_csv(file, index = False)
            
            continue
        
        # Manually assign cell types
        with open(utils_dir + "cell_type_dict_manual.pkl", "rb") as file:
            manual_dict = pickle.load(file)
        cell_dict = manual_dict[data]

        # Create cell type assignments
        adata.obs["cell_type_merged"] = "Unknown"
        for i in cell_dict.keys():
            ind = pd.Series(adata.obs["cluster_labels"]).isin(cell_dict[i])
            adata.obs.loc[ind, "cell_type_merged"] = i
        adata.obs["cell_type_merged"] = pd.Categorical(adata.obs["cell_type_merged"], categories=["Adipocyte", "B cell", "CD4+ T cell", "CD8+ T cell", "Dendritic cell", "Endothelial cell", "Epithelial cell (non-malignant)", "Fibroblast (CAF)", "Lymphatic endothelial cell", "Malignant cell", "Mast cell", "Mesothelial cell", "Myeloid cell", "Pericyte", "Smooth muscle cell", "T cell", "Mixed", "Unknown"], ordered=True)
        adata.obs["cell_type_merged"] = adata.obs["cell_type_merged"].cat.remove_unused_categories()
        
    # Plot merged cell types
    sc.set_figure_params(figsize = plot_figsize)
    ax = sc.pl.scatter(adata, x = plot_coords[0], y = plot_coords[1], color = "cell_type_merged", size = 0.5, title = " ", show = False)
    ax.grid(False)
    plt.savefig(output_dir + "overall_cell_type_merged.jpeg", dpi = 500, bbox_inches = "tight")
    plt.close()

    # Save anndata; create the target folder first so the run does not fail
    # at the very last write when the folder does not exist yet
    intermediate_dir = data_dir + "intermediate_data/"
    os.makedirs(intermediate_dir, exist_ok = True)
    adata.write_h5ad(intermediate_dir + "adata.h5ad")

Unique genes in the transcripts after filtering: 5001 / 5001
In-nucleus ratio: 49.04%
In-cytoplasm ratio: 39.66%
Unique genes in the count matrix after filtering: 5001
Plot size: (5, 7)
Cell type labels are available for Xenium_5K_BC. Proceeding to map cell type labels.
Number of cells before removing NA: 699110
Number of cells after removing NA: 402871
Number of cells before removing NA: 402871
Number of cells after removing NA: 402871
Unique genes in the transcripts after filtering: 5001 / 5001
In-nucleus ratio: 46.58%
In-cytoplasm ratio: 42.38%
Unique genes in the count matrix after filtering: 5001
Plot size: (7, 5)
Cell type labels are available for Xenium_5K_OC. Proceeding to map cell type labels.
Number of cells before removing NA: 407124
Number of cells after removing NA: 327607
Number of cells before removing NA: 327607
Number of cells after removing NA: 327607
Unique genes in the transcripts after filtering: 5001 / 5001
In-nucleus ratio: 41.63%
In-cytoplasm ratio: 45.57%
Uniqu

### H&E image

In [7]:
# NOTE(review): this cell is disabled (everything below is commented out).
# If re-enabled, it relies on `data_dir` still holding the value left by the last
# iteration of the processing loop above; set it explicitly before running.

# # Read image
# img = tifffile.imread(data_dir + "raw_data/HE_image.tif")

# # Resize image
# h, w = img.shape[:2]
# shorter_target = 2000
# scale = shorter_target / min(h, w)
# new_w = int(w * scale)
# new_h = int(h * scale)

# img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
# img_bgr_small = cv2.resize(img_bgr, (new_w, new_h), interpolation = cv2.INTER_AREA)
# cv2.imwrite(data_dir + "intermediate_data/HE_image.png", img_bgr_small)