In [None]:
# If plotting libraries misbehave, try resetting LD_LIBRARY_PATH for the Julia
# kernel; it can be set in the kernel spec under ~/.local/share/jupyter/kernels/.
if haskey(ENV, "LD_LIBRARY_PATH")
    @assert ENV["LD_LIBRARY_PATH"] == ""
end
import Pkg
# Registered packages this notebook imports by name.
pkgs = ["Revise", "FASTX", "ProgressMeter", "DataFrames", "uCSV"]
# Pkg.add(pkgs)
# Build and evaluate an `import <pkg>` expression for every listed package.
foreach(pkgs) do pkg
    eval(Expr(:import, Expr(:., Symbol(pkg))))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# Project layout, resolved relative to the parent of the current working directory.
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = joinpath(data_dir, "genomes")
# Create the genome directory (and any missing parents) if it does not exist yet.
mkpath(genome_dir)

In [None]:
# Full paths of the `.fna` files in `genome_dir` whose path does not contain "normalized".
reference_assemblies = filter(readdir(genome_dir, join=true)) do path
    occursin(r"\.fna$", path) && !occursin("normalized", path)
end

In [None]:
# Pick the first reference assembly, failing with an actionable message when the
# genome directory contains no reference FASTA at all.
isempty(reference_assemblies) && error("no reference assemblies (*.fna) found in $(genome_dir)")
reference_assembly = first(reference_assemblies)

In [None]:
# Catalogue the assembly produced for every (reference assembly, assembler, coverage)
# combination that has an output directory inside `genome_dir`. Combinations without
# an output directory are skipped with a warning.
assembly_table = DataFrames.DataFrame(
    reference_assembly = String[],
    assembler = String[],
    coverage = String[],
    directory = String[],
    fasta = String[]
)

assemblers = [
    "megahit",
    "spades_isolate",
    "flye",
    "raven",
    # "hifiasm",
    # "hicanu",
    # "mycelia"
]
coverages = ["10x", "100x", "1000x"]

# FASTA file, relative to the assembler's output directory, recorded for each assembler.
assembler_fasta_names = Dict(
    "megahit" => "final.contigs.fa",
    "spades_isolate" => "scaffolds.fasta",
    "flye" => "assembly.fasta",
    "raven" => "assembly.fasta",
)

# The directory listing does not change while the table is built, so read it once
# instead of once per combination.
genome_subdirectories = filter(isdir, readdir(genome_dir, join=true))

for reference_assembly in reference_assemblies
    for assembler in assemblers
        # Fail loudly when an assembler is enabled above without a registered FASTA name.
        haskey(assembler_fasta_names, assembler) ||
            error("no FASTA filename registered for assembler $(assembler)")
        for coverage in coverages
            # occursin("$(coverage)_$(assembler)", x)
            directories = filter(genome_subdirectories) do x
                occursin(reference_assembly, x) && occursin(assembler, x) && occursin(coverage, x)
            end
            if isempty(directories)
                # The String-typed columns cannot store `missing`, and
                # joinpath(missing, ...) throws, so skip this combination.
                @warn "no assembly directory found; skipping" reference_assembly assembler coverage
                continue
            end
            length(directories) == 1 || error(
                "expected exactly one assembly directory for " *
                "$(reference_assembly) $(assembler) $(coverage), found $(length(directories))"
            )
            directory = first(directories)
            fasta = joinpath(directory, assembler_fasta_names[assembler])
            push!(assembly_table, (;reference_assembly, assembler, coverage, directory, fasta))
        end
    end
end
# Store the reference and FASTA paths relative to `genome_dir`.
assembly_table[!, "reference_assembly"] = replace.(assembly_table[!, "reference_assembly"], "$(genome_dir)/" => "")
assembly_table[!, "fasta"] = replace.(assembly_table[!, "fasta"], "$(genome_dir)/" => "")
assembly_table

In [None]:
# Run cactus-pangenome (in a podman-hpc container) for every assembly in
# `assembly_table`, pairing it with its reference assembly. Rows whose output
# directory already exists are skipped. A failing run is logged and does not stop
# the remaining rows.
CORES = 8
MEMORY = CORES * 4
# One log file per run is written here; create the directory up front because
# redirecting output into a missing directory fails.
cactus_log_dir = joinpath(homedir(), "workspace", "slurmlogs")
mkpath(cactus_log_dir)
ProgressMeter.@showprogress for row in DataFrames.eachrow(assembly_table)
    # `out`, `jobstore` and `config` are relative to `genome_dir`, which is mounted
    # as the container's working directory (/app).
    out = row["fasta"] * "-cactus"
    outdir = joinpath(genome_dir, out)
    if isdir(outdir)
        @info "$(outdir) already exists"
    else
        jobstore = row["fasta"] * "-cactus-job-store"
        config = row["fasta"] * "-cactus-config.txt"
        cactus_config_file = joinpath(genome_dir, config)

        # Headerless, tab-delimited sample sheet: sample name, then path.
        table = DataFrames.DataFrame(
            samples = ["REFERENCE", "ALTERNATE"],
            file_paths = [row["reference_assembly"], row["fasta"]]
        )
        uCSV.write(cactus_config_file, data=collect(DataFrames.eachcol(table)), header=missing, delim='\t')

        # too small to need to use sbatch - NERSC doesn't really have allocations this small,
        # so the command is run directly instead of being submitted via Mycelia.nersc_sbatch.
        # NOTE(review): `-it` requests an interactive TTY although output is redirected
        # to a log file - confirm podman-hpc tolerates this without a terminal.
        cmd = `
        podman-hpc run -it -v $(genome_dir):/app -w /app quay.io/comparative-genomics-toolkit/cactus:v2.8.1 cactus-pangenome
        ./$(jobstore)
        ./$(config)
        --maxCores $(CORES)
        --maxMemory $(MEMORY)Gb
        --outDir $(out)
        --outName $(out)
        --reference REFERENCE
        --gbz
        --gfa
        --vcf
        --odgi
        `

        logpath = joinpath(cactus_log_dir, replace(out, "/" => "_"))

        try
            # Share a single handle between stdout and stderr; redirecting both to
            # the same path separately makes the two streams overwrite each other.
            open(logpath, "w") do log_io
                run(pipeline(cmd, stdout=log_io, stderr=log_io))
            end
        catch err
            # Let a user interrupt stop the whole loop rather than only this row.
            err isa InterruptException && rethrow()
            @warn "check logs @ $(logpath)" exception=(err, catch_backtrace())
        end
    end
end

In [None]:
# isdir(joinpath(genome_dir, "cactus-job-store")) && rm(joinpath(genome_dir, "cactus-job-store"), recursive=true)