# Baysor outputs -> Scanpy h5ad (m20_s31)

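This notebook scans every `section_*` folder under the data root for Baysor outputs in its `m20_s31` subfolder, combines the per-cell gene counts with the per-cell statistics into an AnnData object, and writes one `.h5ad` file per section.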


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import scanpy as sc

# Scanpy logging verbosity; level 3 additionally shows 'hint' messages
# (on top of errors, warnings and info).
sc.settings.verbosity = 3


## Configure roots


In [3]:
# Root directory that holds one `section_*` folder per section.
data_root = Path("/Volumes/processing2/nature-dev-mouse-reanalysis/data")
# Name of the Baysor run subfolder looked up inside each section folder.
baysor_subdir = "m20_s31"

# Destination for the converted .h5ad files; created (with parents) if missing.
out_dir = data_root / "h5ad"
out_dir.mkdir(parents=True, exist_ok=True)


## Find Baysor outputs (m20_s31)


In [20]:
def has_dot_underscore(path: Path) -> bool:
    """Tell whether any component of `path` starts with ``'._'``.

    Every component is inspected -- the final name as well as each ancestor
    directory -- so a path nested anywhere below a ``._*`` entry is flagged.

    Args:
        path: Path to inspect; it does not need to exist on disk.

    Returns:
        True if at least one component begins with ``'._'``, else False.
    """
    for component in path.parts:
        if component.startswith('._'):
            return True
    return False

def find_baysor_outputs(root: Path, subdir: "str | None" = None):
    """Locate per-section Baysor output files under `root`.

    Each immediate sub-directory of `root` whose name starts with
    ``section_`` is inspected (in sorted order) for a `subdir` folder. A
    section is reported only when that folder contains both a counts file
    (the first existing name from the candidate list below) and
    ``segmentation_cell_stats.csv``.

    Args:
        root: Directory containing the ``section_*`` folders.
        subdir: Name of the Baysor run folder inside each section. When
            omitted, the notebook-level ``baysor_subdir`` setting is used,
            so existing ``find_baysor_outputs(root)`` calls are unchanged.

    Returns:
        A list of dicts with keys ``'section'`` (section folder name),
        ``'dir'`` (Baysor folder), ``'counts'`` and ``'stats'`` (file paths).
    """
    if subdir is None:
        # Fall back to the notebook configuration for backward compatibility.
        subdir = baysor_subdir

    # Accepted counts/assignment file names, in priority order (first hit wins).
    counts_candidates = (
        'segmentation_segmentation.csv',
        'segmentation.csv',
        'segmentation_counts.tsv',
        'segmentation.tsv',
    )

    section_dirs = sorted(
        p for p in root.iterdir()
        if p.is_dir() and p.name.startswith('section_')
    )

    outputs = []
    for section_dir in section_dirs:
        baysor_dir = section_dir / subdir
        # A non-directory can never contain the expected files, so it is
        # skipped up front; paths with a '._' component are ignored as well.
        if not baysor_dir.is_dir() or has_dot_underscore(baysor_dir):
            continue

        counts_path = next(
            (
                baysor_dir / name
                for name in counts_candidates
                if (baysor_dir / name).exists()
            ),
            None,
        )
        stats_path = baysor_dir / 'segmentation_cell_stats.csv'
        if counts_path is None or not stats_path.exists():
            continue

        outputs.append({
            'section': section_dir.name,
            'dir': baysor_dir,
            'counts': counts_path,
            'stats': stats_path,
        })
    return outputs

# Discover every section under the data root that has a complete set of
# Baysor outputs, then list the directories that will be converted.
all_outputs = find_baysor_outputs(data_root)
print(f"Found {len(all_outputs)} outputs")
for item in all_outputs:
    print(item['dir'])


Found 27 outputs
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_02A/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_02B/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_02C/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_02D/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_02E/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_02F/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_06A/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_06B/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_06C/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_06D/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_06E/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_06F/m20_s31
/Volumes/processing2/nature-dev-mouse-reanalysis/data/section_10A/m20_s31
/Volumes/processing2/

## Helper: build AnnData


In [21]:
def load_counts_matrix(counts_path: Path) -> pd.DataFrame:
    """Load a Baysor counts file as a cells x genes DataFrame.

    Two layouts are supported, selected by the file suffix:

    * ``.csv`` -- a per-row table with ``cell`` and ``gene`` columns; rows are
      tallied per (cell, gene) pair with ``pd.crosstab``.
    * anything else -- a tab-separated matrix. If the header of the first
      column is one of ``gene``/``genes``/``feature``/``features``
      (case-insensitive) the rows are genes and the matrix is transposed;
      otherwise the rows are taken to be cells.

    Args:
        counts_path: Path to the counts/assignment file.

    Returns:
        DataFrame indexed by cell id (str) with one column per gene (str).

    Raises:
        ValueError: If a ``.csv`` input lacks a ``cell`` or ``gene`` column.
    """
    if counts_path.suffix == '.csv':
        # Read the id columns as text. Letting pandas infer the dtype turns an
        # integer `cell` column that contains blanks into floats, so ids would
        # become '1.0' instead of '1' and no longer match the cell-stats table.
        df = pd.read_csv(counts_path, dtype={'cell': str, 'gene': str})
        if 'cell' not in df.columns or 'gene' not in df.columns:
            raise ValueError('segmentation CSV is missing cell or gene columns')
        # Drop only rows without a cell or gene. A blanket dropna() would also
        # discard rows that merely have a NaN in some unrelated column (and
        # every row if such a column is entirely empty).
        # NOTE(review): if the Baysor version in use marks unassigned rows
        # with cell id 0 rather than a blank, a '0' row will remain here --
        # confirm and filter if needed.
        df = df.dropna(subset=['cell', 'gene'])
        return pd.crosstab(df['cell'], df['gene'])

    df = pd.read_csv(counts_path, sep='\t')
    first_col = df.columns[0]
    genes_in_rows = first_col.lower() in {'gene', 'genes', 'feature', 'features'}
    df = df.set_index(first_col)
    if genes_in_rows:
        # genes x cells on disk -> transpose to cells x genes
        var_names = df.index.astype(str)
        obs_names = df.columns.astype(str)
        X = df.to_numpy().T
    else:
        obs_names = df.index.astype(str)
        var_names = df.columns.astype(str)
        X = df.to_numpy()

    return pd.DataFrame(X, index=obs_names, columns=var_names)

def load_cell_stats(stats_path: Path) -> pd.DataFrame:
    """Read the per-cell statistics CSV and index it by cell id.

    Args:
        stats_path: Path to the cell-stats CSV; it must have a ``cell`` column.

    Returns:
        The table with ``cell`` converted to strings and used as the index;
        all remaining columns are kept as read.

    Raises:
        ValueError: If the file has no ``cell`` column.
    """
    table = pd.read_csv(stats_path)
    if 'cell' not in table:
        raise ValueError('segmentation_cell_stats.csv is missing cell column')
    # Cell ids are stored as strings so they can be matched by label later.
    cell_ids = table['cell'].astype(str)
    return table.assign(cell=cell_ids).set_index('cell')

def build_adata(counts_path: Path, stats_path: Path):
    """Assemble an AnnData object from a counts file and a cell-stats table.

    Args:
        counts_path: File passed to ``load_counts_matrix``.
        stats_path: File passed to ``load_cell_stats``.

    Returns:
        An AnnData whose ``X`` is built from the count matrix and whose
        ``obs`` is the cell-stats table aligned to the matrix rows.
    """
    counts = load_counts_matrix(counts_path)
    # Align the statistics to the row order of the count matrix; cells that
    # are absent from the stats table end up with NaN-filled rows.
    aligned_stats = load_cell_stats(stats_path).reindex(counts.index)
    adata = sc.AnnData(X=counts)
    adata.obs = aligned_stats
    return adata


## Build and save h5ad files


In [22]:
def safe_name(name: str) -> str:
    """Return `name` with spaces turned into underscores and '#' removed."""
    # One translation pass: ' ' -> '_', '#' -> deleted.
    return name.translate(str.maketrans({' ': '_', '#': None}))

# Convert every discovered section: build its AnnData, tag each cell with the
# section name and the source directory, then write it to
# `<out_dir>/<section>__<baysor folder name>.h5ad`.
for item in all_outputs:
    section = item['section']
    adata = build_adata(item['counts'], item['stats'])
    # Provenance columns, constant for all cells of this section.
    adata.obs['section'] = section
    adata.obs['source_dir'] = str(item['dir'])
    out_name = safe_name(section) + '__' + item['dir'].name + '.h5ad'
    out_path = out_dir / out_name
    print(f'Writing {out_path}...')
    adata.write(out_path)

# Show the output directory as the cell result.
out_dir


Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_02A__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_02B__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_02C__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_02D__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_02E__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_02F__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_06A__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_06B__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_06C__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad/section_06D__m20_s31.h5ad...
Writing /Volumes/processing2/nature-dev-

PosixPath('/Volumes/processing2/nature-dev-mouse-reanalysis/data/h5ad')