In [5]:
import os
import re
import glob
import time
import math
import shutil
import subprocess
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import miniball

# Environment diagnostics: report where the notebook runs and which external
# tools are reachable. This cell only reports; it must not abort the notebook.
print("CWD:", os.getcwd())
print("baysor on PATH:", shutil.which("baysor"))

# `check_output` raises OSError when `julia` is not installed and
# CalledProcessError when it exits non-zero; report that instead of crashing,
# mirroring how a missing `baysor` is simply printed as None above.
try:
    julia_version = subprocess.check_output(["julia", "--version"]).decode().strip()
except (OSError, subprocess.CalledProcessError) as exc:
    julia_version = f"unavailable ({exc!r})"
print("Julia:", julia_version)

CWD: /Users/chenyang/Desktop/mcDETECT/simulation
baysor on PATH: /usr/local/bin/baysor
Julia: julia version 1.10.10


### User configuration

In [6]:
# Roots relative to this notebook's working directory.
# SIM_DATA_ROOT is scanned for input CSVs; BAYSOR_OUT_ROOT receives one output
# directory per input CSV, mirroring the input's relative path.
SIM_DATA_ROOT = "simulated_data"
BAYSOR_OUT_ROOT = "baysor_outputs"

# Baysor parameters (tune once then freeze).
# DEFAULT_MIN_MOLS is forwarded to the Baysor CLI as `-m`, DEFAULT_SCALE as `-s`.
DEFAULT_MIN_MOLS = 20
DEFAULT_SCALE = 3.0

# Julia threads: exported to the Baysor subprocess as JULIA_NUM_THREADS.
DEFAULT_THREADS = 16

# Resume logic: a simulation is skipped when its baysor_spheres.csv already exists.
RESUME_IF_DONE = True

# Optional: ask Baysor to generate plots. This flag only toggles "--plot";
# it does not control whether segmentation.csv is kept.
KEEP_BAYSOR_PLOTS = True  # Baysor "--plot" creates plots in out_dir

# Debug: run only a subset. "First N" means the first N rows in discovery order,
# which is a lexicographic path sort (so seed_10 comes before seed_2).
LIMIT_N = None  # e.g., 10 for first 10, or None for all

# Echo the two roots so the executed notebook records which directories were used.
print("SIM_DATA_ROOT:", SIM_DATA_ROOT)
print("BAYSOR_OUT_ROOT:", BAYSOR_OUT_ROOT)

SIM_DATA_ROOT: simulated_data
BAYSOR_OUT_ROOT: baysor_outputs


### Discover simulated CSVs (single_marker + multi_marker)

In [None]:
def discover_simulated_data(sim_root: str) -> pd.DataFrame:
    """
    Scan ``sim_root`` for simulated transcript CSVs and return one row per file.

    Two directory layouts are recognised:

    * ``single_marker/{dimension}/{scenario}/seed_{seed}.csv``
    * ``multi_marker/{dimension}/all/seed_{seed}.csv``

    Parameters
    ----------
    sim_root : str
        Directory that contains the ``single_marker`` / ``multi_marker`` trees.

    Returns
    -------
    pd.DataFrame
        Columns ``mode``, ``dimension``, ``scenario``, ``seed``, ``is_3d`` and
        ``input_csv``. All ``single_marker`` rows come first, then all
        ``multi_marker`` rows; within each mode rows follow the lexicographic
        order of the file paths. When nothing is found a message is printed and
        an empty frame that still carries these columns is returned, so
        downstream column lookups keep working.
    """
    columns = ["mode", "dimension", "scenario", "seed", "is_3d", "input_csv"]
    seed_re = re.compile(r"seed_(\d+)\.csv$")

    # (mode directory, glob for the scenario directory)
    layouts = [("single_marker", "*"), ("multi_marker", "all")]

    # Escape the root so glob metacharacters in the user's path ("[", "*", "?")
    # are matched literally rather than interpreted as patterns.
    safe_root = glob.escape(sim_root)

    rows = []
    for mode, scenario_glob in layouts:
        pattern = os.path.join(safe_root, mode, "*", scenario_glob, "seed_*.csv")
        for csv_path in sorted(glob.glob(pattern)):
            # Normalise first so mixed "/" and "\\" separators split reliably.
            parts = os.path.normpath(csv_path).split(os.sep)
            m = seed_re.search(parts[-1])
            if not m:
                # Matches the glob but not the numeric seed pattern, e.g. seed_x.csv.
                continue

            dimension, scenario = parts[-3], parts[-2]
            rows.append({
                "mode": mode,
                "dimension": dimension,
                "scenario": scenario,
                "seed": int(m.group(1)),
                "is_3d": ("3D" in dimension.upper()),
                "input_csv": csv_path,
            })

    df = pd.DataFrame(rows, columns=columns)
    if df.shape[0] == 0:
        print("No simulated CSVs found. Check SIM_DATA_ROOT and your directory structure.")
    return df


# Build the run manifest (one row per simulated CSV) and preview it.
inputs_df = discover_simulated_data(sim_root=SIM_DATA_ROOT)
print("Total simulations found:", len(inputs_df.index))
inputs_df.head()

### Convert your simulated CSV → Baysor molecule table

In [None]:
def simulated_to_baysor_table(sim_csv: str, is_3d: bool) -> pd.DataFrame:
    """
    Load a simulated transcript CSV and reshape it into the molecule table that
    is handed to Baysor.

    Parameters
    ----------
    sim_csv : str
        Path to a simulated CSV with ``global_x``, ``global_y`` and ``target``
        columns, plus ``global_z`` when ``is_3d`` is true.
    is_3d : bool
        When true, ``z`` is copied from ``global_z``; otherwise ``z`` is filled
        with ``0.0`` and ``global_z`` is neither required nor read.

    Returns
    -------
    pd.DataFrame
        Columns ``transcript_id`` (0-based row position in the input file),
        ``x``, ``y``, ``z`` (floats) and ``gene`` (string).

    Raises
    ------
    ValueError
        If a required input column is missing.
    """
    df = pd.read_csv(sim_csv).reset_index(drop=True)

    # Safety checks: only demand global_z when it is actually going to be used,
    # so purely 2D simulation files without a z column are accepted.
    required_cols = {"global_x", "global_y", "target"}
    if is_3d:
        required_cols.add("global_z")
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"{sim_csv} missing columns: {missing}")

    baysor_df = pd.DataFrame({
        # After reset_index the index is 0..n-1, i.e. the row position.
        "transcript_id": df.index.to_numpy(dtype=int),
        "x": df["global_x"].astype(float),
        "y": df["global_y"].astype(float),
        "z": df["global_z"].astype(float) if is_3d else 0.0,
        "gene": df["target"].astype(str),
    })
    return baysor_df

### Output path mapping (mirror directory layout)

In [None]:
def make_baysor_out_paths(sim_csv: str, sim_root: str, out_root: str):
    """
    Map an input CSV to its output locations, mirroring the input tree.

    The CSV's path relative to ``sim_root``, minus its extension, becomes a
    directory under ``out_root``.

    Returns
    -------
    tuple[str, str, str]
        ``(out_dir, spheres_csv, log_txt)`` where ``spheres_csv`` is
        ``out_dir/baysor_spheres.csv`` and ``log_txt`` is
        ``out_dir/baysor_cmd.log``.
    """
    # e.g. single_marker/3D/D/seed_1.csv -> single_marker/3D/D/seed_1
    relative_stem, _extension = os.path.splitext(os.path.relpath(sim_csv, sim_root))

    # e.g. baysor_outputs/single_marker/3D/D/seed_1
    run_dir = os.path.join(out_root, relative_stem)

    return (
        run_dir,
        os.path.join(run_dir, "baysor_spheres.csv"),
        os.path.join(run_dir, "baysor_cmd.log"),
    )

### Run Baysor CLI (subprocess)

In [None]:
def run_baysor_cli(
    in_csv: str,
    out_dir: str,
    is_3d: bool,
    min_molecules_per_cell: int,
    scale: float,
    n_threads: int = 8,
    make_plots: bool = True,
    timeout_sec: int | None = None,
) -> str:
    """
    Invoke ``baysor run`` on a molecule CSV and return the resulting
    ``segmentation.csv`` path.

    Parameters
    ----------
    in_csv : str
        Molecule table with ``x``, ``y``, ``gene`` (and ``z``) columns.
    out_dir : str
        Output directory passed to Baysor via ``-o``; created if missing.
    is_3d : bool
        When true, ``-z z`` is added to the command line.
    min_molecules_per_cell : int
        Forwarded as ``-m``.
    scale : float
        Forwarded as ``-s``.
    n_threads : int
        Exported to the child process as ``JULIA_NUM_THREADS``.
    make_plots : bool
        When true, ``--plot`` is added to the command line.
    timeout_sec : int | None
        Passed to ``subprocess.run`` as its timeout; ``None`` means no limit.

    Returns
    -------
    str
        Path to ``segmentation.csv`` inside ``out_dir``.

    Raises
    ------
    subprocess.CalledProcessError
        If Baysor exits with a non-zero status.
    subprocess.TimeoutExpired
        If ``timeout_sec`` elapses first.
    FileNotFoundError
        If Baysor finishes but ``segmentation.csv`` is not in ``out_dir``.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Optional argument groups collapse to empty lists when not requested.
    z_args = ["-z", "z"] if is_3d else []
    plot_args = ["--plot"] if make_plots else []

    argv = [
        "baysor", "run", "-x", "x", "-y", "y", "-g", "gene",
        *z_args,
        "-m", str(min_molecules_per_cell), "-s", str(scale),
        *plot_args,
        "-o", str(target_dir), in_csv,
    ]

    # Child gets the parent's environment plus the requested Julia thread count.
    child_env = {**os.environ, "JULIA_NUM_THREADS": str(n_threads)}

    subprocess.run(argv, check=True, env=child_env, timeout=timeout_sec)

    result_csv = target_dir / "segmentation.csv"
    if result_csv.exists():
        return str(result_csv)

    # List (at most 20 of) the files that were produced to aid debugging.
    raise FileNotFoundError(f"Expected {result_csv} not found. Files: {list(target_dir.iterdir())[:20]}")

### Convert Baysor segmentation → “detection spheres” (miniball)

In [None]:
def baysor_segmentation_to_spheres(
    seg_csv: str,
    baysor_input_csv: str,
    miniball_epsilon: float = 1e-4,
    min_points_per_segment: int = 3,
) -> pd.DataFrame:
    """
    Turn a Baysor segmentation into one bounding sphere per segmented cell.

    Reads ``seg_csv`` (must contain ``transcript_id`` and ``cell``) and the
    molecule table ``baysor_input_csv`` (``transcript_id``, ``x``, ``y``,
    ``z``), joins them on ``transcript_id`` and computes the minimal enclosing
    ball of each cell's molecules with ``miniball.get_bounding_ball``.

    Molecules are ignored when their ``cell`` is missing or equal to ``0``
    (treated as unassigned) or, if an ``is_noise`` column exists, when it reads
    as true. Cells with fewer than ``min_points_per_segment`` molecules are
    skipped.

    Returns
    -------
    pd.DataFrame
        Columns ``sphere_x``, ``sphere_y``, ``sphere_z``, ``sphere_r``,
        ``cell_id`` and ``n_molecules``. The columns are present even when no
        sphere is produced, so the frame round-trips through CSV. ``cell_id``
        is an ``int`` when the label is integer-like, otherwise the label as a
        string.

    Raises
    ------
    ValueError
        If ``seg_csv`` lacks ``transcript_id`` or ``cell``.
    """
    coord_cols = ["x", "y", "z"]
    out_cols = ["sphere_x", "sphere_y", "sphere_z", "sphere_r", "cell_id", "n_molecules"]

    seg = pd.read_csv(seg_csv)
    for required in ("transcript_id", "cell"):
        if required not in seg.columns:
            raise ValueError(f"{seg_csv} missing {required}. Columns: {list(seg.columns)}")

    mol = pd.read_csv(baysor_input_csv, usecols=["transcript_id", *coord_cols])

    # If the segmentation file also carries x/y/z, a plain merge would rename
    # both copies to x_x/x_y/... and the coordinate lookup below would fail.
    # The molecule table is the authoritative source for coordinates.
    seg = seg.drop(columns=[c for c in coord_cols if c in seg.columns])
    merged = seg.merge(mol, on="transcript_id", how="inner")

    # typical conventions: cell==0 (or a missing label) means unassigned
    merged = merged[merged["cell"].notna() & (merged["cell"] != 0)]

    # optional noise filter if present
    if "is_noise" in merged.columns:
        merged = merged[merged["is_noise"].astype(str).str.lower() != "true"]

    rows = []
    for cell_id, g in merged.groupby("cell", sort=False):
        coords = g[coord_cols].to_numpy(dtype=float)
        n = coords.shape[0]
        if n < min_points_per_segment:
            continue

        # get_bounding_ball returns the centre and the *squared* radius.
        center, r2 = miniball.get_bounding_ball(coords, epsilon=miniball_epsilon)

        # NOTE(review): cell labels may be non-numeric strings depending on the
        # Baysor release - confirm against the installed version. Keep such
        # labels verbatim instead of crashing on int().
        try:
            cell_label = int(cell_id)
        except (TypeError, ValueError):
            cell_label = str(cell_id)

        rows.append({
            "sphere_x": float(center[0]),
            "sphere_y": float(center[1]),
            "sphere_z": float(center[2]),
            "sphere_r": float(math.sqrt(r2)),
            "cell_id": cell_label,
            "n_molecules": int(n),
        })

    return pd.DataFrame(rows, columns=out_cols)

### Main loop: run Baysor over all saved simulations

In [None]:
def run_all_baysor(
    inputs_df: pd.DataFrame,
    sim_root: str,
    out_root: str,
    min_mols: int,
    scale: float,
    threads: int,
    resume_if_done: bool = True,
    make_plots: bool = True,
    limit_n: int | None = None,
) -> pd.DataFrame:
    logs = []
    df = inputs_df.copy()

    if limit_n is not None:
        df = df.head(limit_n)

    for _, row in df.iterrows():
        sim_csv = row["input_csv"]
        is_3d = bool(row["is_3d"])

        out_dir, spheres_csv, cmd_log = make_baysor_out_paths(sim_csv, sim_root, out_root)

        if resume_if_done and os.path.exists(spheres_csv):
            logs.append({
                **row,
                "status": "skipped_exists",
                "out_dir": out_dir,
                "spheres_csv": spheres_csv,
                "n_spheres": None,
                "runtime_sec": 0.0,
                "error": None,
            })
            continue

        t0 = time.time()

        # Make temp Baysor input in a temp folder
        try:
            baysor_df = simulated_to_baysor_table(sim_csv, is_3d)

            with tempfile.TemporaryDirectory() as tmpdir:
                in_csv = os.path.join(tmpdir, "molecules.csv")
                baysor_df.to_csv(in_csv, index=False)

                # run baysor
                seg_csv = run_baysor_cli(
                    in_csv=in_csv,
                    out_dir=out_dir,
                    is_3d=is_3d,
                    min_molecules_per_cell=min_mols,
                    scale=scale,
                    n_threads=threads,
                    make_plots=make_plots,
                )

                # convert to spheres (needs baysor_input_csv path; use in_csv before temp deleted)
                spheres = baysor_segmentation_to_spheres(seg_csv, in_csv)

            os.makedirs(out_dir, exist_ok=True)
            spheres.to_csv(spheres_csv, index=False)

            logs.append({
                **row,
                "status": "ok",
                "out_dir": out_dir,
                "spheres_csv": spheres_csv,
                "n_spheres": int(spheres.shape[0]),
                "runtime_sec": float(time.time() - t0),
                "error": None,
            })

        except Exception as e:
            logs.append({
                **row,
                "status": "failed",
                "out_dir": out_dir,
                "spheres_csv": spheres_csv,
                "n_spheres": None,
                "runtime_sec": float(time.time() - t0),
                "error": repr(e),
            })

    return pd.DataFrame(logs)


# Run the whole batch using the settings from the configuration cell.
# This is the expensive step: each simulation without an existing spheres CSV
# launches one Baysor subprocess.
logs_df = run_all_baysor(
    inputs_df=inputs_df,
    sim_root=SIM_DATA_ROOT,
    out_root=BAYSOR_OUT_ROOT,
    min_mols=DEFAULT_MIN_MOLS,
    scale=DEFAULT_SCALE,
    threads=DEFAULT_THREADS,
    resume_if_done=RESUME_IF_DONE,
    make_plots=KEEP_BAYSOR_PLOTS,
    limit_n=LIMIT_N,
)

# Cell output: how many runs ended as ok / skipped_exists / failed.
logs_df["status"].value_counts()

### Save logs + create an index file for main.ipynb

In [None]:
# Persist the full per-run log next to the Baysor outputs.
os.makedirs(BAYSOR_OUT_ROOT, exist_ok=True)
log_path = os.path.join(BAYSOR_OUT_ROOT, "baysor_run_log.csv")
logs_df.to_csv(log_path, index=False)
print("Saved log:", log_path)

# Index for downstream evaluation: only runs whose spheres file is available
# (freshly computed or already present from an earlier run).
index_df = logs_df.loc[
    logs_df["status"].isin(["ok", "skipped_exists"]),
    ["mode", "dimension", "scenario", "seed", "spheres_csv", "out_dir"],
].copy()

index_path = os.path.join(BAYSOR_OUT_ROOT, "baysor_spheres_index.csv")
index_df.to_csv(index_path, index=False)
print("Saved index:", index_path)

# Header and counts on separate lines, preceded by a blank line.
print("\nStatus counts:", logs_df["status"].value_counts(), sep="\n")

index_df.head()

### (Optional) Quick smoke test visualization for one sample

In [None]:
import matplotlib.pyplot as plt

def quick_plot_spheres(sim_csv, spheres_csv, max_points=5000, max_spheres=500):
    """
    Overlay sphere centres on the simulated transcripts in the x-y plane.

    Both tables are read from CSV. If either has more rows than its cap
    (``max_points`` / ``max_spheres``) it is randomly subsampled with a fixed
    seed so the picture is reproducible. Only centres are drawn; sphere radii
    are not shown. The figure is displayed and nothing is returned.
    """
    transcripts = pd.read_csv(sim_csv)
    spheres = pd.read_csv(spheres_csv)

    # Cap the number of drawn markers to keep rendering fast.
    if len(transcripts) > max_points:
        transcripts = transcripts.sample(max_points, random_state=0)
    if len(spheres) > max_spheres:
        spheres = spheres.sample(max_spheres, random_state=0)

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(transcripts["global_x"], transcripts["global_y"], s=1)
    ax.scatter(spheres["sphere_x"], spheres["sphere_y"], s=10)
    ax.set_title("Simulated transcripts (dots) + Baysor sphere centers (points)")
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    plt.show()


# Smoke test: plot the first simulation that Baysor processed in this session.
# Rows with status "skipped_exists" are not considered here.
ok_rows = logs_df.loc[logs_df["status"].eq("ok")]
if ok_rows.empty:
    print("No successful Baysor runs to plot.")
else:
    r = ok_rows.iloc[0]
    quick_plot_spheres(r["input_csv"], r["spheres_csv"])