# DBiT LPC10 RNA+ATAC feature panel for KaroSpace

This notebook builds a compact test panel from DBiT data by selecting **50 RNA HVGs** and the **same genes from ATAC**, where ATAC features are renamed with an `ATAC_` prefix.

Inputs used here:
- `/Volumes/processing2/KaroSpaceDataWrangle/dbit-data/DBiT_Di/LPC10_S1_RNA.h5ad`
- `/Volumes/processing2/KaroSpaceDataWrangle/dbit-data/DBiT_Di/LPC10_S1_ATAC.h5ad`

In [14]:
from pathlib import Path
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy import sparse

RNA_PATH = Path('/Volumes/processing2/KaroSpaceDataWrangle/dbit-data/DBiT_Di/LPC10_S1_RNA.h5ad')
ATAC_PATH = Path('/Volumes/processing2/KaroSpaceDataWrangle/dbit-data/DBiT_Di/LPC10_S1_ATAC.h5ad')
OUT_PATH = Path('/Volumes/processing2/KaroSpaceDataWrangle/dbit-data/DBiT_Di/LPC10_S1_RNA50_plus_ATAC50_karospace.h5ad')

N_GENES = 50
WRITE_OUTPUT = True  # Switch to True to write OUT_PATH

assert RNA_PATH.exists(), f'Missing RNA input: {RNA_PATH}'
assert ATAC_PATH.exists(), f'Missing ATAC input: {ATAC_PATH}'

In [2]:
rna = sc.read_h5ad(RNA_PATH)
atac = sc.read_h5ad(ATAC_PATH)

print('RNA shape :', rna.shape)
print('ATAC shape:', atac.shape)
print('RNA obs columns :', list(rna.obs.columns))
print('ATAC obs columns:', list(atac.obs.columns))
print('RNA var columns :', list(rna.var.columns))
print('ATAC var columns:', list(atac.var.columns))

RNA shape : (8180, 23364)
ATAC shape: (8180, 24030)
RNA obs columns : ['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'clusters', 'colors_map', 'x', 'y']
ATAC obs columns: ['orig.ident', 'nCount_ATAC', 'nFeature_ATAC', 'clusters', 'colors_map', 'x', 'y']
RNA var columns : ['vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable']
ATAC var columns: ['vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable']


In [3]:
if not rna.obs_names.equals(atac.obs_names):
    shared = rna.obs_names.intersection(atac.obs_names)
    if len(shared) == 0:
        raise ValueError('RNA and ATAC have no shared observations.')
    rna = rna[shared].copy()
    atac = atac[shared].copy()
    print(f'Aligned to shared observations: {len(shared)}')

if {'x', 'y'}.issubset(rna.obs.columns):
    rna.obsm['spatial'] = rna.obs[['x', 'y']].to_numpy(dtype=np.float32)
if {'x', 'y'}.issubset(atac.obs.columns):
    atac.obsm['spatial'] = atac.obs[['x', 'y']].to_numpy(dtype=np.float32)

if 'counts' not in rna.layers:
    rna.layers['counts'] = rna.X.copy()
if 'counts' not in atac.layers:
    atac.layers['counts'] = atac.X.copy()

In [5]:
def select_rna_hvgs_present_in_atac(rna_adata: ad.AnnData, atac_adata: ad.AnnData, n_genes: int = 50):
    atac_genes = set(atac_adata.var_names.astype(str))

    if 'vst.variable' in rna_adata.var.columns:
        mask = rna_adata.var['vst.variable'].fillna(False).astype(bool)
        candidates = rna_adata.var.index[mask]
        if 'vst.variance.standardized' in rna_adata.var.columns:
            order = (
                rna_adata.var.loc[candidates, 'vst.variance.standardized']
                .astype(float)
                .sort_values(ascending=False)
                .index
            )
        else:
            order = candidates
    elif 'highly_variable' in rna_adata.var.columns:
        mask = rna_adata.var['highly_variable'].fillna(False).astype(bool)
        candidates = rna_adata.var.index[mask]
        if 'highly_variable_rank' in rna_adata.var.columns:
            order = (
                rna_adata.var.loc[candidates, 'highly_variable_rank']
                .astype(float)
                .sort_values(ascending=True)
                .index
            )
        else:
            order = candidates
    else:
        sc.pp.highly_variable_genes(
            rna_adata,
            n_top_genes=max(2000, n_genes * 10),
            flavor='seurat_v3',
            layer='counts' if 'counts' in rna_adata.layers else None,
        )
        candidates = rna_adata.var.index[rna_adata.var['highly_variable'].fillna(False)]
        order = candidates

    matched = [g for g in order.astype(str) if g in atac_genes]
    selected = matched[:n_genes]

    if len(selected) < n_genes:
        raise ValueError(
            f'Only {len(selected)} RNA HVGs were found in ATAC, requested {n_genes}.'
        )
    return selected


selected_genes = select_rna_hvgs_present_in_atac(rna, atac, n_genes=N_GENES)
selected_genes[:10], len(selected_genes)

(['C1qa',
  'Laptm5',
  'Myo1f',
  'Otx2os1',
  'Lpl',
  'Trem2',
  'C1qc',
  'Csf3r',
  'Cyba',
  'Irf8'],
 50)

In [6]:
rna_sub = rna[:, selected_genes].copy()
atac_sub = atac[:, selected_genes].copy()

rna_sub.var = rna_sub.var.copy()
rna_sub.var['source_modality'] = 'RNA'
rna_sub.var['paired_gene'] = rna_sub.var_names.astype(str)
rna_sub.var['highly_variable'] = True

atac_sub.var = atac_sub.var.copy()
atac_sub.var['source_modality'] = 'ATAC'
atac_sub.var['paired_gene'] = atac_sub.var_names.astype(str)
atac_sub.var['highly_variable'] = True
atac_sub.var_names = [f'ATAC_{g}' for g in atac_sub.var_names.astype(str)]

X_combined = sparse.hstack([rna_sub.X, atac_sub.X], format='csr')
var_combined = pd.concat([rna_sub.var, atac_sub.var], axis=0)
obs_combined = rna_sub.obs.copy()

panel = ad.AnnData(X=X_combined, obs=obs_combined, var=var_combined)
panel.layers['counts'] = panel.X.copy()

if {'x', 'y'}.issubset(panel.obs.columns):
    panel.obsm['spatial'] = panel.obs[['x', 'y']].to_numpy(dtype=np.float32)

panel.uns['panel_info'] = {
    'source_rna': str(RNA_PATH),
    'source_atac': str(ATAC_PATH),
    'n_rna_hvg': N_GENES,
    'n_atac_matched': N_GENES,
    'atac_feature_prefix': 'ATAC_',
}

panel

AnnData object with n_obs × n_vars = 8180 × 100
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'clusters', 'colors_map', 'x', 'y'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'source_modality', 'paired_gene', 'highly_variable'
    uns: 'panel_info'
    obsm: 'spatial'
    layers: 'counts'

In [11]:
print('Panel shape:', panel.shape)
print(panel.var['source_modality'].value_counts())
print('Spatial present:', 'spatial' in panel.obsm)
print(panel.var[['source_modality', 'paired_gene']].head(12))

Panel shape: (8180, 100)
source_modality
RNA     50
ATAC    50
Name: count, dtype: int64
Spatial present: True
        source_modality paired_gene
C1qa                RNA        C1qa
Laptm5              RNA      Laptm5
Myo1f               RNA       Myo1f
Otx2os1             RNA     Otx2os1
Lpl                 RNA         Lpl
Trem2               RNA       Trem2
C1qc                RNA        C1qc
Csf3r               RNA       Csf3r
Cyba                RNA        Cyba
Irf8                RNA        Irf8
Ly86                RNA        Ly86
Cd68                RNA        Cd68


In [20]:
if WRITE_OUTPUT:
    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    panel.write_h5ad(OUT_PATH)
    print(f'Wrote: {OUT_PATH}')
else:
    print('WRITE_OUTPUT is False. Set it to True to write the output file.')

Wrote: /Volumes/processing2/KaroSpaceDataWrangle/dbit-data/DBiT_Di/LPC10_S1_RNA50_plus_ATAC50_karospace.h5ad


In [17]:
panel.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,clusters,colors_map,x,y
CTAAGGTCTTCACGCA,0,7760.575525,17044,C2,#6cfe00,983.0,881.0
ACAGATTCGGAGAACA,0,7621.711619,16094,C2,#6cfe00,1424.0,1322.0
ACACGACCGGTGCGAA,0,7671.569407,15524,C2,#6cfe00,1392.0,1290.0
ATCATTCCGCTCGGTA,0,7746.637712,16411,C2,#6cfe00,1298.0,1353.0
AGCACCTCGAGTTAGC,0,7517.402963,19603,C12,#60e8fc,1487.0,63.0
...,...,...,...,...,...,...,...
CACCTTACAGTGGTCA,0,7268.729916,20901,C8,#e44a2a,1078.0,2896.0
CCTAATCCAGTGGTCA,0,7618.667279,20372,C10,#d900db,1172.0,2896.0
CAAGGAGCAGTGGTCA,0,7161.160035,20642,C12,#60e8fc,1046.0,2896.0
CCGACAACAGTGGTCA,0,7498.618253,20592,C3,#e97dda,1141.0,2896.0


In [19]:
panel.obs['sample_id'] = 'LPC10'