In [13]:
import pandas as pd
import anndata as ad
import scanpy as sc
import os

In [14]:
# Define the paths to your Xenium output files
output_dir = "/home/xilab/paper-data-analysis/2025.01.22_Nature_PMID39843748_Tissue-resident_memory_CD8_T_cell_diversity_is_spatiotemporally_imprinted/paper-data/Spatial-TRM-paper/data/raw_data/spatial_raw_compressed_data/Spatial_Perturb/output-XETG00341__0032977__perturb1_SI2_AG0084__20240808__215945"
cells_file = os.path.join(output_dir, "cells.parquet") # Or "cells.csv.gz"
h5_file = os.path.join(output_dir, "cell_feature_matrix.h5")
output_csv = "xenium_cell_expression_coordinates.csv"

In [15]:
# --- 1. Load cell coordinates and metadata ---
# Use pandas to read the Parquet file (efficient for large data)
# If using CSV.gz, use: cells_df = pd.read_csv(cells_file)
try:
    cells_df = pd.read_parquet(cells_file)
except FileNotFoundError:
    print(f"{cells_file} not found. Check path or use the CSV.gz version.")
    exit()
# The cells_df contains cell_id, x_centroid, y_centroid, and other metrics

In [16]:
# --- 2. Load gene expression matrix ---
# The .h5 file is an AnnData object, a standard format for single-cell data
# adata = ad.read_h5ad(h5_file)
adata = sc.read_10x_h5(h5_file)
# The matrix only includes transcripts with a Q-Score >= 20 and assigned to cells

In [17]:
adata.var_names.tolist()

['Acta2',
 'Adam12',
 'Adam15',
 'Adam17',
 'Adam2',
 'Adam23',
 'Adam9',
 'Adamdec1',
 'Adgre1',
 'Aicda',
 'Aif1',
 'Aldh1a1',
 'Anxa2',
 'Apoe',
 'Atp1b1',
 'Avil',
 'Axin2',
 'Btg1',
 'C3',
 'Cacna1a',
 'Cacnb2',
 'Ccl11',
 'Ccl20',
 'Ccl25',
 'Ccl5',
 'Ccn1',
 'Ccn2',
 'Ccr5',
 'Ccr6',
 'Ccr7',
 'Ccr9',
 'Cd19',
 'Cd226',
 'Cd24a',
 'Cd27',
 'Cd274',
 'Cd28',
 'Cd34',
 'Cd3e',
 'Cd4',
 'Cd44',
 'Cd69',
 'Cd7',
 'Cd70',
 'Cd74',
 'Cd79a',
 'Cd80',
 'Cd81',
 'Cd83',
 'Cd86',
 'Cd8a',
 'Cd8b1',
 'Cdc20',
 'Cdca8',
 'Cdh1',
 'Cenpp',
 'Cfap46',
 'Chad',
 'Chga',
 'Chgb',
 'Chn2',
 'Chrm3',
 'Clca4a',
 'Clec1b',
 'Clec9a',
 'Col18a1',
 'Col1a1',
 'Col2a1',
 'Col3a1',
 'Col4a1',
 'Comp',
 'Crip1',
 'Csf1r',
 'Cst7',
 'Ctla4',
 'Ctsb',
 'Cx3cl1',
 'Cx3cr1',
 'Cxcl10',
 'Cxcl12',
 'Cxcl9',
 'Cxcr3',
 'Cybb',
 'Cyp2c55',
 'Dapl1',
 'Dclk1',
 'Dlc1',
 'Dlk1',
 'Dll1',
 'Dll3',
 'Dll4',
 'Dock10',
 'Dpp4',
 'Dusp10',
 'Dusp18',
 'E2f1',
 'Ebf1',
 'Egr1',
 'Epcam',
 'F11r',
 'Fabp2',
 'Fbxo32

In [18]:
# --- 3. Extract partial gene expression (e.g., specific genes) ---
# Convert the sparse matrix to a dense DataFrame for easier merging
expression_df = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=adata.var_names)
# # Optional: Filter for specific genes if you don't need all of them
# selected_genes = ['Dclk1', 'Epcam', 'Acta2', 'Neurog3', 'Pecam1', 'S100a4'] 
# expression_df = expression_df[selected_genes]

In [19]:
expression_df['total_counts'] = expression_df.sum(axis=1)

In [20]:
# --- 4. Merge coordinates and expression data ---
# Ensure cell IDs match for merging
cells_df.set_index('cell_id', inplace=True)
# Filter coordinates to only include cells present in the expression matrix
cells_df = cells_df.loc[expression_df.index]

# Concatenate the dataframes along the columns (axis=1)
merged_df = pd.concat([cells_df[['x_centroid', 'y_centroid']], expression_df], axis=1)


In [21]:
merged_df.head()

Unnamed: 0,x_centroid,y_centroid,Acta2,Adam12,Adam15,Adam17,Adam2,Adam23,Adam9,Adamdec1,...,Vim,Vps37b,Vtn,Vwf,Xcl1,Xcr1,Xist,Zdhhc14,Zp3,total_counts
aaaadhib-1,806.861938,3222.780762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133.0
aaaafpil-1,804.078857,3230.00708,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,177.0
aaaaglcb-1,811.722656,3227.608154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0
aaaakfkn-1,818.528687,3232.370361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
aaaaldod-1,812.164185,3248.871582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0


In [22]:
merged_df.shape

(297545, 353)

In [23]:
merged_df = merged_df[merged_df['total_counts'] > 80]
merged_df.shape

(147477, 353)

In [24]:
merged_df_renamed = merged_df.rename(columns={
    'x_centroid': 'x',
    'y_centroid': 'y'
})

merged_df_renamed['z'] = 1

# --- 5. Save the result to a CSV file ---
merged_df_renamed.to_csv(output_csv, index=True, index_label="cell_id")

print(f"Successfully extracted data and saved to {output_csv}")
print(f"Dimensions of output file: {merged_df.shape}")

Successfully extracted data and saved to xenium_cell_expression_coordinates.csv
Dimensions of output file: (147477, 353)
