# Xenium Human PE (Adult Pulmonary Fibrosis) Processing

This notebook adapts the `Xenium_human_lung.ipynb` workflow for Xenium Ranger-style outputs located in:

- `/Volumes/processing2/human_PE`

Pipeline summary:

1. Discover and load all run folders that contain Xenium Ranger `outs/` files.
2. Merge runs into one AnnData object with per-run metadata.
3. Run QC, save a raw checkpoint, then apply filtering, normalization, PCA/UMAP, and Leiden clustering.
4. Add spatial coordinates and inspect spatial cluster structure.
5. Export clustered object and marker-gene tables.


## 1) Imports and plotting defaults

In [None]:
import os
import warnings
from pathlib import Path

# Point the numba and matplotlib cache/config directories at /tmp unless the
# environment already defines them (setdefault never overrides an existing
# value). This runs before scanpy and matplotlib are imported below.
# Helps avoid cache-path issues in constrained environments.
os.environ.setdefault('NUMBA_CACHE_DIR', '/tmp/numba_cache')
os.environ.setdefault('MPLCONFIGDIR', '/tmp/matplotlib')

# NOTE: this silences every warning category for the rest of the session,
# including deprecation and numerical warnings from the libraries below.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

# Scanpy logging level and default figure appearance for the plots that follow.
sc.settings.verbosity = 2
sc.set_figure_params(dpi=110, facecolor='white')


## 2) Configure input/output paths

In [None]:
# Input root that holds one sub-folder per Xenium Ranger run.
BASE_DIR = Path('/Volumes/processing2/human_PE')

# Everything this notebook writes goes under a dedicated sub-folder of the
# input root; create it (and any missing parents) up front.
OUTPUT_DIR = BASE_DIR.joinpath('derived_scanpy')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Output artifacts: merged raw object, clustered object, marker-gene table.
RAW_PATH = OUTPUT_DIR.joinpath('human_pe_raw.h5ad')
CLUSTERED_PATH = OUTPUT_DIR.joinpath('human_pe_clustered.h5ad')
MARKER_TABLE_PATH = OUTPUT_DIR.joinpath('human_pe_markers_leiden_1.0.csv')

print('BASE_DIR:', BASE_DIR)
print('OUTPUT_DIR:', OUTPUT_DIR)


## 3) Discover Xenium Ranger run directories

A valid run folder is any direct child directory of `BASE_DIR` with:

- `outs/cell_feature_matrix.h5`
- `outs/cells.csv.gz`


In [None]:
def find_xenium_runs(base_dir: Path) -> list[Path]:
    """Return the Xenium Ranger run folders directly under ``base_dir``.

    A run folder is a non-hidden direct child directory that contains both
    ``outs/cell_feature_matrix.h5`` and ``outs/cells.csv.gz``.

    Args:
        base_dir: Directory whose direct children are scanned (no recursion).

    Returns:
        The matching run directories in sorted path order; an empty list when
        nothing matches.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist (raised by
            ``Path.iterdir``).
    """
    required_files = ('cell_feature_matrix.h5', 'cells.csv.gz')
    run_paths = []
    for child in sorted(base_dir.iterdir()):
        # One leading-dot test is enough: names starting with '._' also start
        # with '.', so a separate '._' check can never fire.
        if child.name.startswith('.') or not child.is_dir():
            continue
        outs = child / 'outs'
        if all((outs / name).exists() for name in required_files):
            run_paths.append(child)
    return run_paths

# Scan BASE_DIR once and report what was found.
run_dirs = find_xenium_runs(BASE_DIR)

print(f'Found {len(run_dirs)} run directories:')
for run_path in run_dirs:
    print(' -', run_path.name)

# Nothing downstream can work without at least one run, so stop here.
if not run_dirs:
    raise FileNotFoundError(f'No valid Xenium run directories found under {BASE_DIR}')


## 4) Load each run and align matrix + cell metadata

In [None]:
def derive_sample_id(run_name: str) -> str:
    """Turn a run folder name into a short sample identifier.

    A trailing ``_xenium_output`` is stripped first, then a leading
    ``output-``; each is removed at most once and only if present.
    """
    without_suffix = run_name.removesuffix('_xenium_output')
    return without_suffix.removeprefix('output-')


def _read_cell_table(cells_path: Path) -> pd.DataFrame:
    """Read ``cells.csv.gz`` into a frame indexed by string ``cell_id``."""
    cell_info = pd.read_csv(cells_path)
    if 'cell_id' not in cell_info.columns:
        # Fall back to the first column as the cell identifier, whatever its
        # header is (including pandas' 'Unnamed: 0' for a blank header).
        cell_info = cell_info.rename(columns={cell_info.columns[0]: 'cell_id'})

    # The IDs are matched against the matrix obs_names, which are strings;
    # numeric IDs would otherwise match nothing and drop every cell.
    cell_info['cell_id'] = cell_info['cell_id'].astype(str)

    return cell_info.drop_duplicates(subset='cell_id', keep='first').set_index('cell_id')


def load_xenium_ranger_run(run_dir: Path) -> sc.AnnData:
    """Load one Xenium Ranger run as an AnnData with per-cell metadata.

    Reads ``outs/cell_feature_matrix.h5`` and ``outs/cells.csv.gz``, keeps
    only the cells present in both (in matrix order), and replaces ``obs``
    with the matching rows of the cell table.

    Args:
        run_dir: Run folder that contains the ``outs/`` directory.

    Returns:
        AnnData whose ``obs`` holds the cell-table columns plus ``run``,
        ``sample_id`` and ``cell_id``, and whose ``obs_names`` are
        ``'<sample_id>:<cell_id>'``.

    Raises:
        FileNotFoundError: If either required file is missing.
    """
    outs = run_dir / 'outs'
    h5_path = outs / 'cell_feature_matrix.h5'
    cells_path = outs / 'cells.csv.gz'

    missing = [p.name for p in (h5_path, cells_path) if not p.exists()]
    if missing:
        raise FileNotFoundError(f'{run_dir.name}: missing required files: {missing}')

    print(f'Loading {run_dir.name}')
    ad = sc.read_10x_h5(h5_path)
    cell_info = _read_cell_table(cells_path)

    # Intersection of both sources, preserving the matrix's cell order.
    shared = ad.obs_names[ad.obs_names.isin(cell_info.index)]
    dropped_from_matrix = ad.n_obs - len(shared)
    dropped_from_cells = cell_info.shape[0] - len(shared)

    if dropped_from_matrix > 0:
        print(f'  - dropping {dropped_from_matrix} matrix barcodes not found in cells.csv.gz')
    if dropped_from_cells > 0:
        print(f'  - ignoring {dropped_from_cells} cells.csv.gz rows not found in matrix')

    ad = ad[shared].copy()
    ad.obs = cell_info.loc[shared].copy()

    run_name = run_dir.name
    sample_id = derive_sample_id(run_name)

    ad.obs['run'] = run_name
    ad.obs['sample_id'] = sample_id
    # set_index() moved cell_id out of the columns; keep it as a column too.
    ad.obs['cell_id'] = shared.astype(str)

    # Guarantee unique obs names across runs after concatenation.
    ad.obs_names = pd.Index([f'{sample_id}:{cid}' for cid in shared], name='obs_id')

    ad.var_names_make_unique()
    return ad


# Load every discovered run. A run whose required files are missing is
# reported and skipped rather than aborting the whole loop.
ad_list = []
for run_dir in run_dirs:
    try:
        ad_run = load_xenium_ranger_run(run_dir)
    except FileNotFoundError as err:
        print('Skipping:', err)
        continue
    print(f'  -> loaded {ad_run.n_obs:,} cells x {ad_run.n_vars:,} genes')
    ad_list.append(ad_run)

# Concatenation below needs at least one loaded run.
if not ad_list:
    raise RuntimeError('No runs were successfully loaded. Check input folder structure.')


## 5) Concatenate runs and preserve raw counts

In [None]:
# Stack all loaded runs along the cell axis. join='outer' keeps the union of
# genes across runs; merge='same' keeps only the non-concatenated annotations
# that are identical in every run (see the anndata.concat documentation).
ad = sc.concat(ad_list, join='outer', merge='same')
ad.var_names_make_unique()

# Keep raw integer-like counts before normalization.
# X is copied, so later in-place changes to ad.X do not alias this layer.
ad.layers['counts'] = ad.X.copy()

print(ad)
# Last expression of the cell: shows the per-run bookkeeping columns.
ad.obs[['run', 'sample_id', 'cell_id']].head()


## 6) QC metrics and per-sample summary

In [None]:
# Adds per-cell columns such as total_counts and n_genes_by_counts to ad.obs.
sc.pp.calculate_qc_metrics(ad, percent_top=None, log1p=False, inplace=True)

# Named aggregations: output column -> (obs column, reducer).
qc_aggregations = {
    'n_cells': ('cell_id', 'count'),
    'mean_total_counts': ('total_counts', 'mean'),
    'median_total_counts': ('total_counts', 'median'),
    'mean_genes': ('n_genes_by_counts', 'mean'),
    'median_genes': ('n_genes_by_counts', 'median'),
}

# One row per sample, largest samples first.
per_sample = ad.obs.groupby('sample_id').agg(**qc_aggregations)
qc_summary = per_sample.sort_values('n_cells', ascending=False)

qc_summary


In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One histogram per QC column; the x-axis label is the column name itself.
qc_panels = [
    ('total_counts', 'Total counts per cell'),
    ('n_genes_by_counts', 'Genes detected per cell'),
]
for axis, (column, panel_title) in zip(axes, qc_panels):
    ad.obs[column].hist(bins=100, ax=axis)
    axis.set_title(panel_title)
    axis.set_xlabel(column)

plt.tight_layout()


## 7) Save a raw checkpoint

In [None]:
# Persist the merged object before any cells are filtered or X is normalized.
ad.write(RAW_PATH)
print('Wrote:', RAW_PATH)


## 8) Filter low-quality cells, normalize, and log-transform

In [None]:
# Match thresholds used in Xenium_human_lung.ipynb.
# Two sequential in-place filters: first on total counts per cell (>= 15),
# then on the number of detected genes per cell (>= 5).
sc.pp.filter_cells(ad, min_counts=15)
sc.pp.filter_cells(ad, min_genes=5)

# Scale every cell to a total of 100 counts, then apply log(1 + x) to X.
# Order matters: log1p must run on the normalized values.
sc.pp.normalize_total(ad, target_sum=100)
sc.pp.log1p(ad)

print(ad)


## 9) PCA, neighborhood graph, UMAP, and Leiden clustering

In [None]:
# PCA with scanpy's default settings, followed by a log-scale plot of the
# variance ratio for up to 50 components to sanity-check the n_pcs used below.
sc.tl.pca(ad)
sc.pl.pca_variance_ratio(ad, n_pcs=50, log=True)

# k-nearest-neighbour graph (k=15) on the first 30 PCs; UMAP is computed from
# that graph, so neighbors() must run first.
sc.pp.neighbors(ad, n_neighbors=15, n_pcs=30)
sc.tl.umap(ad, min_dist=0.1)

# Colouring by sample gives a quick visual check for per-sample structure.
sc.pl.umap(ad, color=['sample_id'], s=1)


In [None]:
resolutions = [0.1, 0.5, 1.0, 1.5, 2.0]

# One Leiden clustering per resolution, stored as obs['leiden_<resolution>'].
# A resolution whose column already exists is not recomputed, so re-running
# this cell only redraws the plots.
for res in resolutions:
    leiden_key = f'leiden_{res}'
    already_clustered = leiden_key in ad.obs.columns
    if not already_clustered:
        sc.tl.leiden(ad, resolution=res, key_added=leiden_key)
    sc.pl.umap(ad, color=leiden_key, legend_loc='on data', frameon=False)


## 10) Add spatial coordinates and inspect spatial cluster maps

In [None]:
# Store the per-cell centroid columns from cells.csv.gz as an (n_cells, 2)
# array under the 'spatial' basis so embedding plots can use them.
ad.obsm['spatial'] = ad.obs[['x_centroid', 'y_centroid']].to_numpy()

# Global map
# NOTE(review): coordinates are used as-is for all runs in one scatter;
# if each run has its own coordinate origin, samples will be drawn on top of
# each other here - confirm, or rely on the per-sample maps.
sc.pl.embedding(ad, basis='spatial', color=['sample_id', 'leiden_1.0'], s=2, frameon=False)


In [None]:
# Per-sample spatial cluster maps
for sid in sorted(ad.obs['sample_id'].unique()):
    ad_sub = ad[ad.obs['sample_id'] == sid].copy()
    sc.pl.embedding(
        ad_sub,
        basis='spatial',
        color='leiden_1.0',
        title=f'{sid} (leiden_1.0)',
        s=2,
        frameon=False,
    )


## 11) Marker genes for cluster interpretation

In [None]:
cluster_key = 'leiden_1.0'

# Rank genes for each cluster with a t-test, then plot the top 25 per cluster
# on independent y-axes.
sc.tl.rank_genes_groups(ad, groupby=cluster_key, method='t-test')
sc.pl.rank_genes_groups(ad, n_genes=25, sharey=False)

# Long-format table for all clusters, ordered by cluster and then by
# decreasing log fold change within each cluster.
markers = sc.get.rank_genes_groups_df(ad, group=None).sort_values(
    by=['group', 'logfoldchanges'],
    ascending=[True, False],
)
markers.head()


In [None]:
# Keep the first 30 rows of each cluster in the table's current row order,
# and write them without the DataFrame index.
top_markers = markers.groupby('group').head(30)
top_markers.to_csv(MARKER_TABLE_PATH, index=False)
print('Wrote marker table:', MARKER_TABLE_PATH)


## 12) Save clustered AnnData

In [None]:
# Persist the object as it stands at this point in the notebook.
ad.write(CLUSTERED_PATH)
print('Wrote:', CLUSTERED_PATH)
