# Baysor Output Diagnostics

This notebook scans for Baysor output folders and summarizes key QC metrics from
`segmentation.csv`, `segmentation_cell_stats.csv`, and `segmentation_counts.tsv`.


## 1. Configure the output root
Set this to the directory where Baysor outputs live. The default `.` searches the current working directory (and all of its subdirectories) recursively.


In [None]:
# Root directory that is searched recursively for Baysor outputs ("." = current working directory).
output_root = "."


## 2. Load packages
If CSV or DataFrames is missing, install it with `using Pkg; Pkg.add(["CSV", "DataFrames"])`. `Statistics` ships with Julia and needs no installation.


In [None]:
using CSV
using DataFrames
using Statistics


## 3. Find Baysor output folders


In [None]:
"""
    find_outputs(root::AbstractString)

Walk `root` recursively and return the sorted list of directories that directly
contain a file named `segmentation.csv`.
"""
function find_outputs(root::AbstractString)
    # Typed comprehension keeps the result a `Vector{String}` even when nothing matches.
    hits = String[dir for (dir, _, files) in walkdir(root) if "segmentation.csv" in files]
    return sort!(hits)
end

# Collect every directory under `output_root` that contains a segmentation.csv
# and report how many were found.
outputs = find_outputs(output_root)
println("Found $(length(outputs)) outputs")


## 4. Summarize outputs


In [None]:
"""
    pick_column(names_vec, candidates)

Return the first entry of `candidates` that matches a column name in `names_vec`,
comparing against the lowercased column names. The original (un-lowercased) name
is returned; `nothing` is returned when no candidate matches.

Candidates are compared as given, so they are expected to be lowercase already.
If several column names collapse to the same lowercase form, the last one wins.
"""
function pick_column(names_vec, candidates)
    # Index the original names by their lowercase spelling.
    by_lowercase = Dict{String,Any}()
    for name in names_vec
        by_lowercase[lowercase(String(name))] = name
    end
    # Candidate order defines priority.
    for cand in candidates
        haskey(by_lowercase, cand) && return by_lowercase[cand]
    end
    return nothing
end

# Classify one raw value from the cell column: return a hashable key identifying
# the assigned cell, or `nothing` when the transcript has no cell.
# A missing entry carries no cell ID at all.
_assigned_cell_key(::Missing) = nothing
# Numeric IDs: only strictly positive values denote a cell (`NaN > 0` is false).
_assigned_cell_key(x::Real) = x > 0 ? x : nothing
# Everything else (typically strings): blank means unassigned, integer-looking
# text follows the numeric rule, any other text is kept as an opaque cell label.
function _assigned_cell_key(x)
    label = strip(string(x))
    isempty(label) && return nothing
    parsed = tryparse(Int, label)
    parsed === nothing && return String(label)
    return parsed > 0 ? parsed : nothing
end

"""
    read_segmentation_summary(path)

Read the per-transcript table at `path` and return a named tuple
`(n_transcripts, n_cells, unassigned_frac)`.

- `n_transcripts`: number of rows in the file.
- `n_cells`: number of distinct assigned cell IDs.
- `unassigned_frac`: fraction of rows whose cell value is missing, blank, or a
  number `<= 0`.

`n_cells` and `unassigned_frac` are `missing` when no cell column
(`cell`, `cell_id`, `cellid`, `cell_index`, case-insensitive) exists or the file
has no rows.

Non-numeric, non-blank IDs are counted as assigned cells, and missing/blank
entries are counted as unassigned instead of being silently dropped.
NOTE(review): newer Baysor releases reportedly write string cell IDs and leave
the field empty for noise molecules — confirm against the Baysor version in use.
"""
function read_segmentation_summary(path)
    df = CSV.read(path, DataFrame)
    n_transcripts = nrow(df)
    cell_col = pick_column(names(df), ["cell", "cell_id", "cellid", "cell_index"])
    # Without a cell column or without rows there is nothing to summarize
    # (this also avoids dividing by zero below).
    if cell_col === nothing || n_transcripts == 0
        return (n_transcripts=n_transcripts, n_cells=missing, unassigned_frac=missing)
    end

    assigned = Set{Any}()
    n_unassigned = 0
    for raw in df[!, cell_col]
        key = _assigned_cell_key(raw)
        if key === nothing
            n_unassigned += 1
        else
            push!(assigned, key)
        end
    end

    return (
        n_transcripts=n_transcripts,
        n_cells=length(assigned),
        unassigned_frac=n_unassigned / n_transcripts,
    )
end

"""
    read_cell_stats_summary(path)

Read the per-cell statistics table at `path` and return a named tuple
`(n_cells, mean_counts, median_counts)`.

- `n_cells`: number of unique values in the cell column, or the row count when
  no cell column (`cell`, `cell_id`, `cellid`, `cell_index`) is found.
- `mean_counts` / `median_counts`: mean and median of the first matching count
  column among `n_transcripts`, `n_molecules`, `n_counts`, `n_genes`, ignoring
  missing values. Both are `missing` when no such column exists or when it
  holds no non-missing values (e.g. a header-only file), instead of throwing.

NOTE(review): `n_genes` is accepted as a last-resort count column, in which case
the statistics describe that column rather than transcript counts — confirm
this fallback is intended.
"""
function read_cell_stats_summary(path)
    df = CSV.read(path, DataFrame)
    cell_col = pick_column(names(df), ["cell", "cell_id", "cellid", "cell_index"])
    count_col = pick_column(names(df), ["n_transcripts", "n_molecules", "n_counts", "n_genes"])
    n_cells = cell_col === nothing ? nrow(df) : length(unique(df[!, cell_col]))

    no_counts = (n_cells=n_cells, mean_counts=missing, median_counts=missing)
    count_col === nothing && return no_counts

    # Materialize once so both statistics see the same values; `median` of an
    # empty collection throws, so bail out before computing anything.
    counts = collect(skipmissing(df[!, count_col]))
    isempty(counts) && return no_counts

    return (n_cells=n_cells, mean_counts=mean(counts), median_counts=median(counts))
end

"""
    file_size_mb(path)

Return the size of the file at `path` in mebibytes (bytes / 1024^2), rounded to
two decimal places.
"""
function file_size_mb(path)
    bytes_per_mib = 1024^2
    return round(filesize(path) / bytes_per_mib; digits=2)
end

# Build one summary row per discovered output directory.
rows = NamedTuple[]
for dir in outputs
    # segmentation.csv was present when the directory was discovered.
    seg = read_segmentation_summary(joinpath(dir, "segmentation.csv"))

    # The per-cell stats file is optional; fall back to all-missing metrics.
    stats_path = joinpath(dir, "segmentation_cell_stats.csv")
    stats = if isfile(stats_path)
        read_cell_stats_summary(stats_path)
    else
        (n_cells=missing, mean_counts=missing, median_counts=missing)
    end

    # The counts matrix is optional as well; only its presence and size are reported.
    counts_path = joinpath(dir, "segmentation_counts.tsv")
    has_counts = isfile(counts_path)
    counts_size = has_counts ? file_size_mb(counts_path) : missing

    row = (
        output_dir=dir,
        n_transcripts=seg.n_transcripts,
        n_cells=seg.n_cells,
        unassigned_frac=seg.unassigned_frac,
        cell_stats_cells=stats.n_cells,
        mean_counts=stats.mean_counts,
        median_counts=stats.median_counts,
        counts_size_mb=counts_size,
        has_counts=has_counts,
    )
    push!(rows, row)
end

# NOTE: this global shares its name with `Base.summary`.
summary = DataFrame(rows)
summary


## 5. Inspect a single output directory
Pick one output folder to dig deeper. This previews a few rows and gives a simple
per-cell transcript count summary if a cell column is available.


In [None]:
# Default to the first entry of `outputs`; `nothing` signals that the scan found none.
selected_output = if isempty(outputs)
    nothing
else
    first(outputs)
end
selected_output


In [None]:
# Preview the selected output, then summarize transcripts per assigned cell.
if !isnothing(selected_output)
    seg_path = joinpath(selected_output, "segmentation.csv")
    # Read only the first rows for a cheap look at the columns.
    seg_preview = CSV.read(seg_path, DataFrame; limit=5)
    display(seg_preview)

    cell_col = pick_column(names(seg_preview), ["cell", "cell_id", "cellid", "cell_index"])
    if !isnothing(cell_col)
        # The per-cell tally needs every row, so the whole file is loaded here.
        seg_full = CSV.read(seg_path, DataFrame)
        cell_ids = [tryparse(Int, string(raw)) for raw in seg_full[!, cell_col]]
        # Keep only IDs that parse as integers and are strictly positive.
        valid = filter(id -> !isnothing(id) && id > 0, cell_ids)
        if !isempty(valid)
            per_cell = combine(groupby(DataFrame(cell=valid), :cell), nrow => :n_transcripts)
            println("Per-cell transcript count summary (assigned cells only):")
            display(describe(per_cell))
        else
            println("No assigned cells detected in segmentation.csv")
        end
    else
        println("No cell column found in segmentation.csv")
    end
else
    println("No outputs found. Check output_root.")
end
