In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

  from pkg_resources import get_distribution, DistributionNotFound


In [None]:
# ==================== Main operations ==================== #

# Datasets to scan. Only the keys (dataset folder names) are used in this cell;
# the per-dataset options are carried along unchanged.
settings = {
    "Xenium_5K_BC": {"cell_type_label": True},
    "Xenium_5K_OC": {"cell_type_label": True},
    "Xenium_5K_CC": {"cell_type_label": True},
    "Xenium_5K_LC": {"cell_type_label": False},
    "Xenium_5K_Prostate": {"cell_type_label": False},
    "Xenium_5K_Skin": {"cell_type_label": False},
}

# One set of gene names per dataset, in the same order as `settings`.
gene_sets = []

for data in settings:

    # Locate and load the raw feature matrix of this dataset.
    data_dir = f"../../data/{data}/"
    h5_path = data_dir + "raw_data/cell_feature_matrix.h5"
    adata_raw = sc.read_10x_h5(h5_path)

    # Only the gene names are needed; keep them as a set for the intersection below.
    gene_sets.append(set(adata_raw.var.index))

    # Drop the matrix right away so at most one dataset is held in memory.
    del adata_raw

# Genes present in every dataset, sorted for a deterministic order.
common_genes = set.intersection(*gene_sets)
shared_genes = sorted(common_genes)

# Persist the list twice: a header-less one-column CSV and a NumPy array file.
shared_genes_df = pd.DataFrame({"gene": shared_genes})
shared_genes_df.to_csv("shared_genes.csv", index=False, header=False)
np.save("shared_genes.npy", shared_genes)

print(f"Number of shared genes: {len(shared_genes)}")

Number of shared genes: 5001


In [None]:
# Check overlap with stress granule marker genes
#
# Loads the shared gene list written to "shared_genes.npy", reads the SG marker
# table from "SG_markers.xlsx", keeps markers whose SG fraction exceeds `thr`,
# and reports how many of those markers are part of the shared gene list.
genes = np.load("shared_genes.npy").tolist()

# Column holding the per-gene fraction used for ranking and thresholding.
sg_fraction_col = "Fraction of RNA molecules in SGs"

sg_markers_df = pd.read_excel("SG_markers.xlsx")
# Highest fraction first, so the marker lists below are ordered by fraction.
sg_markers_df = sg_markers_df.sort_values(by=sg_fraction_col, ascending=False)

thr = 0.25
# Single .loc call selects rows above the threshold and the "gene" column at once.
sg_marker_genes = sg_markers_df.loc[sg_markers_df[sg_fraction_col] > thr, "gene"].to_list()

# Membership is tested against a set instead of the list: same result and order,
# but each lookup is constant-time rather than a scan over all shared genes.
shared_gene_lookup = set(genes)
overlap_genes = [gene for gene in sg_marker_genes if gene in shared_gene_lookup]

print(f"Number of SG marker genes (fraction > {thr}): {len(sg_marker_genes)}")
print(f"Number of overlapping genes in the dataset: {len(overlap_genes)}")
print("-" * 30)

Number of SG marker genes (fraction > 0.25): 1470
Number of overlapping genes in the dataset: 479
------------------------------
