# Baysor outputs -> Scanpy h5ad (m50_s4)

This notebook scans two root folders for Baysor outputs whose folder path contains `m50` + `s4`
or `m50` + `scale4` (e.g. `m50_s4`, `m_50_s_4`, `m50_scale4`). For each match it loads the expression matrix and
cell metadata, builds an AnnData object, and writes one `.h5ad` file.


In [29]:
from pathlib import Path
import re
import pandas as pd
import numpy as np
import scanpy as sc

sc.settings.verbosity = 3


## Configure roots
Set the two root folders you want to scan.


In [30]:
# Root folders that are scanned for Baysor outputs.
root_a = Path("/Volumes/processing2/spinal_cord_injury/data/Data_AJ")
root_b = Path("/Users/christoffer/Downloads/new_spinal_cord_data_CG")
roots = [root_a, root_b]

# Fail fast when a root is missing (e.g. the volume is not mounted), before
# any scanning starts.
for r in roots:
    if r.exists():
        continue
    raise FileNotFoundError(f"Root not found: {r}")


## Find Baysor outputs (m50/s4 or m50/scale4)
Recursively searches each root for `segmentation.csv`, `segmentation_counts.tsv`, or `segmentation.tsv` (when a folder holds more than one, the first in that order is used), then checks the folder path
for either `m50` + `s4` or `m50` + `scale4` (including variants like `m_50_s_4`). A folder is kept only if
`segmentation_cell_stats.csv` sits alongside the counts file.


In [31]:
import re

def is_target_path(path: Path) -> bool:
    """Return True when ``path`` looks like a Baysor ``m50`` / scale-4 folder.

    The lower-cased names of ``path`` and all of its parents are searched, so
    the two tokens may sit in the same component (``m_50_s_4``) or in different
    ones. Both tokens must be present:

    * ``m50`` -- written ``m50``, ``m_50``, ``m-50`` or ``m 50``
    * ``s4`` or ``scale4`` -- with the same optional separator before the digit

    Each token is anchored so that near-misses are rejected: the leading letter
    must not be the tail of a longer word (``items4`` is not ``s4``) and the
    number must not continue (``m_500``, ``s_40`` and ``s4.5`` do not match).

    Parameters
    ----------
    path : Path
        Folder to test.

    Returns
    -------
    bool
        True if both tokens occur somewhere in the path.
    """
    path_str = "/".join(p.name.lower() for p in path.parents) + "/" + path.name.lower()
    # Optional single separator (underscore or hyphen), with optional whitespace around it.
    sep = r"\s*[_-]?\s*"
    # (?<![a-z]) keeps the token from matching inside a longer word;
    # (?!\.?\d) keeps the number from being a prefix of a longer or decimal value.
    has_m50 = re.search(rf"(?<![a-z])m{sep}50(?!\.?\d)", path_str)
    has_s4 = re.search(rf"(?<![a-z])s(?:cale)?{sep}4(?!\.?\d)", path_str)
    return bool(has_m50 and has_s4)

def find_baysor_outputs(root: Path):
    """Collect Baysor output folders below ``root``.

    The tree is searched recursively for each candidate counts file name in
    priority order. A folder is recorded when it passes ``is_target_path`` and
    also contains ``segmentation_cell_stats.csv``. Only the first counts file
    found for a folder is kept.

    Returns
    -------
    list of dict
        One entry per folder with keys ``"dir"``, ``"counts"`` and ``"stats"``.
    """
    by_folder = {}
    # Earlier names win when a folder holds several candidate files.
    for filename in ("segmentation.csv", "segmentation_counts.tsv", "segmentation.tsv"):
        for hit in root.rglob(filename):
            run_dir = hit.parent
            stats_file = run_dir / "segmentation_cell_stats.csv"
            if is_target_path(run_dir) and stats_file.exists():
                if run_dir not in by_folder:
                    by_folder[run_dir] = {"dir": run_dir, "counts": hit, "stats": stats_file}
    return list(by_folder.values())

# Gather matching Baysor folders from every configured root, in root order.
all_outputs = []
for root in roots:
    all_outputs += find_baysor_outputs(root)

# Report what was found so the selection can be checked before converting.
print(f"Found {len(all_outputs)} outputs")
for item in all_outputs:
    print(item["dir"])


Found 50 outputs
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#11/region0/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#11/region1/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#11/region3/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#11/region2/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#5/region0/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#5/region1/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#5/region3/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#5/region2/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#2/region0/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#2/region1/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#2/region3/m_50_s_4
/Volumes/processing2/spinal_cord_injury/data/Data_AJ/Slide#2/region2/m_50_s_4
/Volumes/processing2/spinal_cord_injury/dat

## Helper: build AnnData
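Handles both orientations of `segmentation_counts.tsv` (genes x cells or cells x genes). For `segmentation.csv`, rows without an assigned cell are dropped and the remaining rows are tallied into a cells x genes count matrix.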


In [32]:
def load_counts_matrix(counts_path: Path):
    """Load a counts file and return it as a cells x genes matrix.

    Two layouts are supported, selected by file name:

    * ``segmentation.csv`` -- a long table with one gene column and one cell
      column. Rows without an assigned cell are dropped and the remaining rows
      are tallied per (cell, gene) pair.
    * anything else -- a tab-separated matrix. If the first column header is
      one of ``gene``/``genes``/``feature``/``features`` the table is treated
      as genes x cells and transposed; otherwise the first column is taken to
      hold cell IDs and the table is used as is.

    Parameters
    ----------
    counts_path : Path
        Path to the counts file.

    Returns
    -------
    tuple
        ``(X, obs_names, var_names)`` where ``X`` is a cells x genes NumPy
        array and the two name indexes hold string labels.

    Raises
    ------
    ValueError
        If ``segmentation.csv`` has no recognisable gene or cell column.
    """
    if counts_path.name == "segmentation.csv":
        df = pd.read_csv(counts_path)
        gene_candidates = ["gene", "gene_name", "feature"]
        cell_candidates = ["cell", "cell_id", "cellid", "cell_index"]
        gene_col = next((c for c in gene_candidates if c in df.columns), None)
        cell_col = next((c for c in cell_candidates if c in df.columns), None)
        if gene_col is None or cell_col is None:
            raise ValueError("segmentation.csv is missing gene or cell columns")

        cell_raw = df[cell_col].astype(str)
        cell_numeric = pd.to_numeric(df[cell_col], errors="coerce")
        if cell_numeric.notna().any():
            # Numeric IDs: only strictly positive values count as assigned.
            assigned = cell_numeric.notna() & (cell_numeric > 0)
        else:
            # String IDs: drop the known "no cell" placeholders.
            bad = {"0", "-1", "unassigned", "none", "nan", ""}
            assigned = ~cell_raw.str.lower().isin(bad)
        df = df.loc[assigned]

        # Blank cells make pandas read an integer ID column as float, which
        # would stringify as "1.0" and fail to match integer-derived labels
        # such as "1" from another table. Restore integer labels when every
        # remaining ID is a whole number.
        cell_ids = df[cell_col]
        if pd.api.types.is_float_dtype(cell_ids) and (cell_ids % 1 == 0).all():
            cell_ids = cell_ids.astype("int64")

        counts = pd.crosstab(cell_ids.astype(str), df[gene_col].astype(str))
        obs_names = counts.index.astype(str)
        var_names = counts.columns.astype(str)
        X = counts.to_numpy()
        return X, obs_names, var_names

    df = pd.read_csv(counts_path, sep="\t")

    # Heuristic: if the first column looks like gene names, treat as genes x cells
    first_col = df.columns[0]
    if first_col.lower() in {"gene", "genes", "feature", "features"}:
        df = df.set_index(first_col)
        var_names = df.index.astype(str)
        obs_names = df.columns.astype(str)
        # genes x cells -> transpose to cells x genes
        X = df.to_numpy().T
    else:
        # Assume first column is cell IDs
        df = df.set_index(first_col)
        obs_names = df.index.astype(str)
        var_names = df.columns.astype(str)
        X = df.to_numpy()

    return X, obs_names, var_names

def load_cell_stats(stats_path: Path):
    """Read the per-cell statistics CSV.

    If the table has a recognised cell-ID column (``cell``, ``cell_id``,
    ``cellid`` or ``cell_index``, checked in that order), its values are cast
    to strings and it becomes the index. Otherwise the table is returned with
    the default index from ``pd.read_csv``.
    """
    table = pd.read_csv(stats_path)
    for id_column in ("cell", "cell_id", "cellid", "cell_index"):
        if id_column in table.columns:
            table[id_column] = table[id_column].astype(str)
            return table.set_index(id_column)
    return table

def build_adata(counts_path: Path, stats_path: Path):
    """Assemble an AnnData object from a counts file and a cell-stats file.

    The counts file supplies the matrix plus the cell and gene labels. The
    stats table, when non-empty, is left-joined onto the cell labels so that
    ``obs`` keeps exactly one row per cell in the matrix.
    """
    matrix, cell_ids, gene_ids = load_counts_matrix(counts_path)
    cell_table = pd.DataFrame(index=cell_ids)
    gene_table = pd.DataFrame(index=gene_ids)

    cell_stats = load_cell_stats(stats_path)
    if not cell_stats.empty:
        cell_table = cell_table.join(cell_stats, how="left")

    return sc.AnnData(X=matrix, obs=cell_table, var=gene_table)


## Build and save h5ad files
Writes one `.h5ad` per Baysor output folder.


In [33]:
def sample_name_from_path(folder: Path) -> str:
    """Build a ``<slide>__<region>`` label from a Baysor folder path.

    The folder is expected to sit at ``.../<slide>/<region>/<baysor_folder>``,
    so the region is its parent directory and the slide is the grandparent.
    """
    region_dir = folder.parent
    slide_dir = region_dir.parent
    return "__".join((slide_dir.name, region_dir.name))

def safe_name(name: str) -> str:
    """Make ``name`` friendlier for file names: spaces become underscores and ``#`` is removed."""
    return name.translate({ord(" "): "_", ord("#"): None})

# Destination for the converted files; created on first run.
out_dir = Path("/Volumes/processing2/output_spinal_cord_injury")
out_dir.mkdir(parents=True, exist_ok=True)

# Convert each discovered folder, tag every cell with its sample label and
# source folder, and write <sample>__<baysor_folder>.h5ad into out_dir.
for item in all_outputs:
    sample = sample_name_from_path(item["dir"])
    adata = build_adata(item["counts"], item["stats"])
    adata.obs["sample"] = sample
    adata.obs["source_dir"] = str(item["dir"])
    out_name = f"{safe_name(sample)}__{item['dir'].name}.h5ad"
    out_path = out_dir / out_name
    print(f"Writing {out_path}...")
    adata.write(out_path)

out_dir


Writing /Volumes/processing2/output_spinal_cord_injury/Slide11__region0__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide11__region1__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide11__region3__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide11__region2__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide5__region0__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide5__region1__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide5__region3__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide5__region2__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide2__region0__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide2__region1__m_50_s_4.h5ad...
Writing /Volumes/processing2/output_spinal_cord_injury/Slide2__region3__m_50_s_4.h5ad...
Writing /Volumes/

PosixPath('/Volumes/processing2/output_spinal_cord_injury')

In [34]:
# Spot-check: read one .h5ad file back from the output directory.
# NOTE(review): presumably produced by the write loop in this notebook -- confirm the file exists.
ad = sc.read_h5ad('/Volumes/processing2/output_spinal_cord_injury/slide20__region_5d-2__baysor_m50_scale4.h5ad')

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(81265, 500))