# Concatenate Baysor outputs (outer join)

This notebook reads all per-sample outputs from a directory and concatenates them into one AnnData with an outer join on variables.

**Note on memory:** concatenation creates a single in-memory AnnData. If you run out of RAM, consider writing a subset first, or using Zarr-based workflows.


In [3]:
from pathlib import Path
import anndata as ad
import scanpy as sc

# ---- Parameters ----
INPUT_DIR = Path('/Volumes/processing2/output_spinal_cord_injury')
PATTERN = '**/*.h5ad'  # adjust if needed
OUTPUT_PATH = INPUT_DIR / 'spinal_cord_injury_merged_outer.h5ad'

# For reproducibility in concat
OBS_KEY = 'batch'  # name for source file label in .obs

print('Input dir:', INPUT_DIR)
print('Output path:', OUTPUT_PATH)


Input dir: /Volumes/processing2/output_spinal_cord_injury
Output path: /Volumes/processing2/output_spinal_cord_injury/spinal_cord_injury_merged_outer.h5ad


In [4]:
# Discover files
# Filters help remove hidden files and obvious duplicates.
EXCLUDE_HIDDEN = True
EXCLUDE_DIRS = {'.ipynb_checkpoints'}
EXCLUDE_SUFFIXES = {'.tmp', '.bak'}

all_files = sorted(INPUT_DIR.glob(PATTERN))
if not all_files:
    raise FileNotFoundError(f'No files found for pattern: {PATTERN}')

def is_hidden(path: Path) -> bool:
    return any(part.startswith('.') for part in path.parts)

filtered = []
for f in all_files:
    if EXCLUDE_HIDDEN and is_hidden(f):
        continue
    if any(part in EXCLUDE_DIRS for part in f.parts):
        continue
    if f.suffix in EXCLUDE_SUFFIXES:
        continue
    filtered.append(f)

# Deduplicate by resolved path (just in case)
seen = set()
files = []
for f in filtered:
    rp = f.resolve()
    if rp in seen:
        continue
    seen.add(rp)
    files.append(f)

print(f'Found {len(all_files)} files before filtering')
print(f'Using {len(files)} files after filtering')
for f in files[:10]:
    print(f)
if len(files) > 10:
    print('...')

# Optional: flag duplicate stems (same filename in different dirs)
from collections import Counter
stem_counts = Counter(f.stem for f in files)
dup_stems = [s for s, c in stem_counts.items() if c > 1]
if dup_stems:
    print('Duplicate stems found (same filename in multiple dirs):')
    for s in dup_stems:
        print('  ', s)


Found 132 files before filtering
Using 66 files after filtering
/Volumes/processing2/output_spinal_cord_injury/Data_AJ__Slide1__m50_s4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Data_AJ__Slide3__m_50_s_4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Slide11__region0__m_50_s_4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Slide11__region1__m_50_s_4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Slide11__region2__m_50_s_4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Slide11__region3__m_50_s_4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Slide12__region0__m_50_s_4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Slide12__region1__m_50_s_4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Slide12__region2__m_50_s_4.h5ad
/Volumes/processing2/output_spinal_cord_injury/Slide12__region3__m_50_s_4.h5ad
...


In [5]:
# Read and concatenate
# If memory is tight, reduce the file set or try concatenating in smaller batches.
adatas = []
for f in files:
    adata = sc.read_h5ad(f)
    # Track origin for downstream grouping
    adata.obs[OBS_KEY] = f.stem
    adatas.append(adata)

adata_merged = ad.concat(
    adatas,
    join='outer',
    label=OBS_KEY,
    keys=[f.stem for f in files],
    index_unique='-'
)

adata_merged


AnnData object with n_obs × n_vars = 3205800 × 500
    obs: 'x', 'y', 'z', 'cluster', 'n_transcripts', 'density', 'elongation', 'area', 'avg_confidence', 'avg_assignment_confidence', 'max_cluster_frac', 'lifespan', 'sample', 'source_dir', 'batch'

In [6]:
# Write output (compressed)
adata_merged.write_h5ad(OUTPUT_PATH, compression='gzip')
print('Wrote:', OUTPUT_PATH)


Wrote: /Volumes/processing2/output_spinal_cord_injury/spinal_cord_injury_merged_outer.h5ad


In [10]:
adata_merged.X.sum(axis=1).mean()

np.float64(nan)

In [12]:
adata_merged.X.sum(axis=1)

array([ 69., 257., 116., ...,  86.,  28.,  18.], shape=(3205800,))

In [13]:
adata_merged.X = adata_merged.X.fillna(0)

AttributeError: 'numpy.ndarray' object has no attribute 'fillna'

In [15]:
X = adata_merged.X

In [17]:
import numpy as np

In [18]:
X[np.isnan(X)] = 0

In [20]:
adata_merged.X =X

In [21]:
adata_merged.X.sum(axis = 1).mean()

np.float64(55.64957982406887)

In [None]:
adata_merged.write_h5ad(OUTPUT_PATH, compression='gzip')
print('Wrote:', OUTPUT_PATH)