# Baysor Batch Processing

This notebook:
1. Searches a folder for `*_cleaned.csv` transcript files
2. Skips files that already have completed segmentation
3. Runs Baysor segmentation on the remaining files

In [7]:
# Activate the project environment first, so the `using` line below resolves its
# packages from that environment.
import Pkg
Pkg.activate("/Users/christoffer/Baysor")
using Baysor, DataFrames, CSV

# Report how many threads this Julia session has available.
println("Threads: $(Threads.nthreads())")

[32m[1m  Activating[22m[39m project at `~/Baysor`


Threads: 16


## Configuration

In [8]:
# Root directory that is searched recursively for cleaned transcript files.
input_folder = "/Users/christoffer/Downloads/new_spinal_cord_data_CG"

# Baysor settings, collected in one NamedTuple so they can be splatted as keyword
# arguments (`run_baysor(file; baysor_params...)`). The `*_column` entries name columns
# of the transcript CSV; `min_molecules_per_cell` and `scale` also determine the name of
# the output folder.
baysor_params = (;
    x_column = :global_x,
    y_column = :global_y,
    z_column = :global_z,
    gene_column = :gene,
    min_molecules_per_cell = 50,
    n_clusters = 20,
    scale = 4.0,
)

(x_column = :global_x, y_column = :global_y, z_column = :global_z, gene_column = :gene, min_molecules_per_cell = 50, n_clusters = 20, scale = 4.0)

## Find Cleaned Transcript Files and Check for Existing Segmentation

In [9]:
"""
    segmentation_exists(csv_dir, min_molecules, scale) -> Bool

Return `true` if `csv_dir` already contains a segmentation result for the given
parameters.

The following sub-folders of `csv_dir` are inspected, where `<M>` is `min_molecules` and
`<S>` is `scale` written without a decimal part when it is a whole number
(`4.0` -> `4`, `4.5` -> `4.5`):

- `baysor_m<M>_scale<S>`
- `baysor_m<M>_s<S>`
- `m<M>_scale<S>`
- `m<M>_s<S>`
- `baysor_segmentation` (independent of the parameters)

A folder counts as a hit when it holds at least one file whose name, compared
case-insensitively, contains `segmentation` and ends in `.csv`.

# Arguments
- `csv_dir`: directory that contains the cleaned transcript file.
- `min_molecules`: value used for the `m<M>` part of the folder name.
- `scale`: value used for the `s<S>` / `scale<S>` part of the folder name.
"""
function segmentation_exists(
    csv_dir::AbstractString, min_molecules::Integer, scale::Real
)::Bool
    m = Int(min_molecules)
    # `Int(scale)` throws `InexactError` for fractional values such as 4.5, so test
    # with `isinteger` before converting instead of comparing `Int(scale) == scale`.
    s = isinteger(scale) ? string(Int(scale)) : string(float(scale))

    candidate_folders = (
        "baysor_m$(m)_scale$(s)",
        "baysor_m$(m)_s$(s)",
        "m$(m)_scale$(s)",
        "m$(m)_s$(s)",
        "baysor_segmentation",
    )

    function is_segmentation_csv(name)
        lc_name = lowercase(name)
        return occursin("segmentation", lc_name) && endswith(lc_name, ".csv")
    end

    return any(candidate_folders) do folder_name
        folder_path = joinpath(csv_dir, folder_name)
        isdir(folder_path) && any(is_segmentation_csv, readdir(folder_path))
    end
end

"""
    find_cleaned_transcript_files(folder; min_molecules=50, scale=4.0)

Recursively collect cleaned transcript files (`*_cleaned.csv`) below `folder`.

File names are compared case-insensitively. A name is accepted when it ends in
`_cleaned.csv` and does not contain `baysor_cleaned` (so
`detected_transcripts_baysor_cleaned.csv` is excluded).

# Keywords
- `min_molecules`, `scale`: passed to `segmentation_exists` to decide whether the
  directory of a matching file already holds a segmentation.

# Returns
A tuple `(files_to_process, files_skipped)` of full paths. A file goes into
`files_skipped` when `segmentation_exists` returns `true` for its directory, and into
`files_to_process` otherwise.
"""
function find_cleaned_transcript_files(folder::String; 
                                        min_molecules::Int=50, 
                                        scale::Float64=4.0)::Tuple{Vector{String}, Vector{String}}
    files_to_process = String[]
    files_skipped = String[]

    # "_baysor_cleaned" always contains "baysor_cleaned", so one exclusion test suffices.
    function is_cleaned_transcript(name)
        lc_name = lowercase(name)
        return endswith(lc_name, "_cleaned.csv") && !occursin("baysor_cleaned", lc_name)
    end

    for (root, _, filenames) in walkdir(folder)
        matches = filter(is_cleaned_transcript, filenames)
        isempty(matches) && continue

        # The existence check depends only on the directory, so run it once per
        # directory rather than once per matching file.
        already_done = segmentation_exists(root, min_molecules, scale)
        target = already_done ? files_skipped : files_to_process
        append!(target, (joinpath(root, name) for name in matches))
    end

    return files_to_process, files_skipped
end

# Scan `input_folder` and split the cleaned transcript files into those that still
# need segmentation and those whose directory already holds one.
files_to_process, files_skipped = find_cleaned_transcript_files(
    input_folder;
    min_molecules = Int(baysor_params.min_molecules_per_cell),
    scale = baysor_params.scale,
)

# List the files that will be processed.
println(repeat("=", 80))
println("FILES TO PROCESS ($(length(files_to_process))):")
println(repeat("=", 80))
foreach(path -> println("  $path"), files_to_process)

# List the files that are skipped.
println("\n", repeat("=", 80))
println("SKIPPED - Segmentation already exists ($(length(files_skipped))):")
println(repeat("=", 80))
foreach(path -> println("  $path"), files_skipped)

FILES TO PROCESS (15):
  /Users/christoffer/Downloads/new_spinal_cord_data_CG/slide20/region_7d-1/detected_transcripts_cleaned.csv
  /Users/christoffer/Downloads/new_spinal_cord_data_CG/slide20/region_7d-2/detected_transcripts_cleaned.csv
  /Users/christoffer/Downloads/new_spinal_cord_data_CG/slide20/region_9d-1/detected_transcripts_cleaned.csv
  /Users/christoffer/Downloads/new_spinal_cord_data_CG/slide20/region_9d-2/detected_transcripts_cleaned.csv
  /Users/christoffer/Downloads/new_spinal_cord_data_CG/slide20/region_9d-3/detected_transcripts_cleaned.csv
  /Users/christoffer/Downloads/new_spinal_cord_data_CG/slide21-DD-d3d4d14liver-d14myelin/region_liver-d14-1/detected_transcripts_cleaned.csv
  /Users/christoffer/Downloads/new_spinal_cord_data_CG/slide21-DD-d3d4d14liver-d14myelin/region_liver-d3-1/detected_transcripts_cleaned.csv
  /Users/christoffer/Downloads/new_spinal_cord_data_CG/slide21-DD-d3d4d14liver-d14myelin/region_liver-d3-2/detected_transcripts_cleaned.csv
  /Users/christo

## Run Baysor on Files That Need Processing

In [None]:
"""
    run_baysor(input_path; params...) -> String

Run Baysor on one cleaned transcript file and return the path of the output folder.

The output folder is created next to `input_path` and is named `m<M>_s<S>`, where `<M>`
is `min_molecules_per_cell` and `<S>` is `scale` written without a decimal part when it
is a whole number (`4.0` -> `4`, `4.5` -> `4.5`). Baysor is called with the output
prefix `<output folder>/segmentation`.

# Keywords
All of these are read from `params` and forwarded to `Baysor.CommandLine.run`; an error
is thrown if one of them is missing:
`x_column`, `y_column`, `z_column`, `gene_column`, `min_molecules_per_cell`,
`n_clusters`, `scale`.
"""
function run_baysor(input_path::String; params...)
    println("\n" * "="^80)
    println("Running Baysor on: $input_path")
    println("="^80)

    # Derive the output folder name from the parameters.
    dir = dirname(input_path)
    m = Int(params[:min_molecules_per_cell])
    scale = params[:scale]
    # `Int(scale)` throws `InexactError` for fractional values such as 4.5, so test
    # with `isinteger` before converting instead of comparing `Int(scale) == scale`.
    s_str = isinteger(scale) ? string(Int(scale)) : string(float(scale))
    output_folder = joinpath(dir, "m$(m)_s$(s_str)")
    mkpath(output_folder)
    output_prefix = joinpath(output_folder, "segmentation")

    println("Output folder: $output_folder")

    # Run Baysor
    Baysor.CommandLine.run(
        input_path;
        x_column = params[:x_column],
        y_column = params[:y_column],
        z_column = params[:z_column],
        gene_column = params[:gene_column],
        min_molecules_per_cell = params[:min_molecules_per_cell],
        n_clusters = params[:n_clusters],
        scale = params[:scale],
        output = output_prefix
    )

    println("\nCompleted: $input_path")
    return output_folder
end

# Run Baysor on every file that still needs processing and record one entry per file.
# Every entry has the same four fields; `output` is `nothing` for failed runs and `error`
# is `nothing` for successful ones.
results = NamedTuple{
    (:file, :status, :output, :error),
    Tuple{String,String,Union{Nothing,String},Union{Nothing,String}}
}[]

for (i, file) in enumerate(files_to_process)
    println("\n[$(i)/$(length(files_to_process))]")
    try
        output = run_baysor(file; baysor_params...)
        push!(results, (file=file, status="success", output=output, error=nothing))
    catch e
        # A user interrupt must stop the whole batch, not only the current file.
        e isa InterruptException && rethrow()
        println("\nERROR running Baysor on $file:")
        # Print the message together with the backtrace so the failure can be diagnosed.
        showerror(stdout, e, catch_backtrace())
        println()
        push!(
            results,
            (file=file, status="error", output=nothing, error=sprint(showerror, e)),
        )
    end
end

println("\n" * "="^80)
println("BAYSOR PROCESSING COMPLETE")
println("="^80)
println("\nSummary:")
n_success = count(r -> r.status == "success", results)
n_error = count(r -> r.status == "error", results)
println("  Processed: $(n_success)")
println("  Errors: $(n_error)")
println("  Skipped (already done): $(length(files_skipped))")

## (Optional) Run Baysor on a Single File

In [None]:
# Uncomment to run Baysor on a specific cleaned file
#=
single_file = "/path/to/detected_transcripts_cleaned.csv"
run_baysor(single_file; baysor_params...)
=#