In [None]:
# If plotting libraries misbehave, try resetting the LD library path for Julia;
# it can be set in the kernel spec under ~/.local/share/jupyter/kernels/.
# When LD_LIBRARY_PATH is present in the environment, require it to be empty.
if haskey(ENV, "LD_LIBRARY_PATH")
    @assert ENV["LD_LIBRARY_PATH"] == ""
end
import Pkg
# Packages imported by name below.
pkgs = [
    "Revise",
    "FASTX",
    "ProgressMeter",
    "DataFrames",
    "uCSV"
]
# Pkg.add(pkgs)
# Build and evaluate one `import <name>` statement per entry of `pkgs`.
foreach(pkgs) do pkg
    eval(Meta.parse("import $pkg"))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# The project root is taken to be the parent of the current working directory.
PROJECT_BASEDIR = pwd() |> dirname
data_dir = joinpath(PROJECT_BASEDIR, "data")
# Create <PROJECT_BASEDIR>/data/genomes (including parents) and keep the path `mkpath` returns.
genome_dir = mkpath(joinpath(PROJECT_BASEDIR, "data", "genomes"))

In [None]:
# Full paths of the entries in `genome_dir` whose path ends in ".fna" and does
# not contain "normalized".
reference_assemblies = filter(readdir(genome_dir, join=true)) do path
    has_fna_suffix = occursin(r"\.fna$", path)
    has_fna_suffix && !occursin("normalized", path)
end

In [None]:
# Pick the first discovered reference assembly. Fail with a descriptive message
# rather than an opaque BoundsError when the genome directory holds no matches.
isempty(reference_assemblies) &&
    throw(ArgumentError("no reference assemblies (*.fna) found in $(genome_dir)"))
reference_assembly = first(reference_assemblies)

In [None]:
# Build one row per (reference assembly, assembler, coverage) combination,
# recording the matching output directory under `genome_dir` and the FASTA
# expected inside it. `directory` and `fasta` are `missing` when no matching
# directory exists, so their columns must allow `missing`.
assembly_table = DataFrames.DataFrame(
    reference_assembly = String[],
    assembler = String[],
    coverage = String[],
    directory = Union{Missing, String}[],
    fasta = Union{Missing, String}[]
)

assemblers = [
    "megahit",
    "spades_isolate",
    "flye",
    "raven",
    # "hifiasm",
    # "hicanu",
    # "mycelia"
]
coverages = ["10x", "100x", "1000x"]

# FASTA file name looked up inside each assembler's output directory.
# Indexing with an assembler that has no entry raises a KeyError instead of
# leaving `fasta` undefined.
assembler_fasta_names = Dict(
    "megahit" => "final.contigs.fa",
    "spades_isolate" => "scaffolds.fasta",
    "flye" => "assembly.fasta",
    "raven" => "assembly.fasta",
)

# The directory listing does not depend on the loop variables; read it once.
candidate_directories = filter(isdir, readdir(genome_dir, join=true))

for reference_assembly in reference_assemblies
    for assembler in assemblers
        for coverage in coverages
            # Match on substrings of the full path: the reference assembly path,
            # the assembler name, and the coverage label must all occur in it.
            directories = filter(candidate_directories) do x
                occursin(reference_assembly, x) && occursin(assembler, x) && occursin(coverage, x)
            end
            if isempty(directories)
                # No output for this combination: keep the row, mark both paths missing.
                directory = missing
                fasta = missing
            else
                @assert length(directories) == 1 "$(reference_assembly) $(assembler) $(coverage)"
                directory = first(directories)
                fasta = joinpath(directory, assembler_fasta_names[assembler])
            end
            push!(assembly_table, (;reference_assembly, assembler, coverage, directory, fasta))
        end
    end
end
assembly_table

In [None]:
# readdir(assembly_table[10, "directory"])

In [None]:
# Core count and memory size interpolated into the cactus-pangenome command.
CORES = 8
MEMORY = 64

# Two-row sheet: a sample label and the assembly path looked up for `identifier`.
table = DataFrames.DataFrame(;
    samples = ["REFERENCE", "ALTERNATE"],
    file_paths = [
        reference_assemblies_by_id[identifier],
        reference_variant_assemblies_by_id[identifier],
    ],
)
cactus_config_file = "$(genome_dir)/$(identifier)-cactus-config.txt"

# Write the columns of `table` to `cactus_config_file`, tab-delimited, with no header.
uCSV.write(
    cactus_config_file;
    data = collect(DataFrames.eachcol(table)),
    header = missing,
    delim = '\t',
)

# Mycelia.add_bioconda_env("cactus")
# cmd = 
# `$(Mycelia.MAMBA) run --live-stream -n cactus cactus-pangenome 
# --mgCores $CORES
# --mapCores $CORES
# --consCores $CORES
# --indexCores $CORES
# --mgMemory $MEMORY
# --consMemory $MEMORY
# --indexMemory $MEMORY
# --maxMemory $MEMORY
# --gbz
# --gfa
# --vcf
# --odgi
# --reference REFERENCE
# --outName cactus-test-prefix
# --outDir ./cactus-test-out
# ./cactus-job-store cactus-test.txt`

# rm(joinpath(genome_dir, "cactus-job-store"), recursive=true)

# Other container image tags, kept for reference only. These must stay commented
# out: a bare image reference parses as Julia arithmetic and throws UndefVarError.
# quay.io/comparative-genomics-toolkit/cactus:latest
# quay.io/comparative-genomics-toolkit/cactus:v2.7.2


# Run cactus-pangenome from the cactus v2.8.1 image via podman-hpc.
# `genome_dir` is mounted at /app and used as the working directory, so the
# relative job store, output directory, and config file name below resolve
# inside `genome_dir`. MEMORY is passed with a "Gb" suffix.
cmd = `podman-hpc run -it -v $(genome_dir):/app -w /app quay.io/comparative-genomics-toolkit/cactus:v2.8.1 cactus-pangenome
--mgCores $CORES
--mapCores $CORES
--consCores $CORES
--indexCores $CORES
--mgMemory $(MEMORY)Gb
--consMemory $(MEMORY)Gb
--indexMemory $(MEMORY)Gb
--maxMemory $(MEMORY)Gb
--gbz
--gfa
--vcf
--odgi
--reference REFERENCE
--outName $(identifier)-cactus
--outDir $(identifier)-cactus
./cactus-job-store $(basename(cactus_config_file))`
run(cmd)
# Remove the job store directory left in `genome_dir`, if present. `run` throws
# on a non-zero exit status, so this line is only reached after a successful run.
isdir(joinpath(genome_dir, "cactus-job-store")) && rm(joinpath(genome_dir, "cactus-job-store"), recursive=true)