In [None]:
# Notebook environment setup: activate the project environment, load Revise
# before the development copy of Mycelia, then install and import the helper
# packages used by later cells.
#
# If plotting libraries fail to load, try resetting the LD path for Julia;
# it can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# An unset LD_LIBRARY_PATH is treated the same as an empty one rather than
# raising a KeyError on the lookup.
isempty(get(ENV, "LD_LIBRARY_PATH", "")) ||
    error("expected LD_LIBRARY_PATH to be unset or empty, got: $(ENV["LD_LIBRARY_PATH"])")
import Pkg
# Pkg.activate(;temp=true)
Pkg.activate("20240827.minimap2-reads-smallest-first")
Pkg.add("Revise")
import Revise

# Alternative locations of the Mycelia checkout:
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

pkgs = String[
    "DataFrames",
    "uCSV",
    "OrderedCollections",
    "CSV"
    # "XAM",
    # "CodecZlib"
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the import expression from a Symbol instead of parsing a string.
    @eval import $(Symbol(pkg))
end

In [None]:
# The project root is the parent of the current working directory; the data
# directory lives directly beneath it and is created if it does not exist.
project_dir = pwd() |> dirname
data_dir = joinpath(project_dir, "data") |> mkpath

In [None]:
# Name of the BLAST database and the path prefix where it is expected on disk.
db = "nt"
path_to_db = joinpath(homedir(), "workspace", "blastdb", db)
# Alternative: fetch and export the database through Mycelia instead of
# assuming the files are already present.
# path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")
# compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)
# Gzipped FASTA export of the database, located next to the database prefix.
compressed_fasta_export = string(path_to_db, ".fna.gz")

In [None]:
# Per-job resources: CPU count, and total memory at 32 GB per thread.
threads = 12
mem_gb = 32 * threads
# Not applied here: scaling mem_gb down to 80%, i.e. Int(floor(mem_gb * .8)).

In [None]:
# One entry per sample directory under <data_dir>/SRA (full paths). Plain files
# that happen to sit in that folder are skipped, since later cells read a
# "trim_galore" subdirectory from every entry.
sra_dirs = filter(isdir, readdir(joinpath(data_dir, "SRA"), join=true))

In [None]:
# sort SRA dirs by size so that smallest jobs will run first

In [None]:
# Pair every SRA directory with the combined on-disk size of its trimmed
# forward and reverse read files, then order the directories from smallest to
# largest.
sra_filesizes = Pair{String,Int64}[]
for sra_dir in sra_dirs
    trim_galore_dir_contents = readdir(joinpath(sra_dir, "trim_galore"), join=true)
    # Locate the paired read files by their filename suffixes.
    forward_idx = findfirst(f -> occursin(r"_1_val_1\.fq\.gz$", f), trim_galore_dir_contents)
    reverse_idx = findfirst(f -> occursin(r"_2_val_2\.fq\.gz$", f), trim_galore_dir_contents)
    if isnothing(forward_idx) || isnothing(reverse_idx)
        error("missing trimmed paired-end reads in $(joinpath(sra_dir, "trim_galore"))")
    end
    # Named *_reads so the loop does not assign to a variable called `reverse`,
    # which collides with Base.reverse.
    forward_reads = trim_galore_dir_contents[forward_idx]
    reverse_reads = trim_galore_dir_contents[reverse_idx]
    push!(sra_filesizes, sra_dir => filesize(forward_reads) + filesize(reverse_reads))
end
size_sorted_sra_directories = first.(sort(sra_filesizes, by=last))

In [None]:
# Batch exponent: the first 2^i directories (smallest first) are considered
# for submission.
# Batches i = 1 through i = 5 are done; the next one to submit is:
i = 6
# Never ask for more directories than actually exist.
N = min(length(size_sorted_sra_directories), 2^i)

In [None]:
# For each of the N smallest SRA directories, build the paired-end mapping
# command against the exported FASTA and hand it to Mycelia.scg_sbatch unless
# the expected output file already exists.
#
# The mapping command is given 80% of the memory requested for the batch job.
# The value does not change between iterations, so it is computed once.
mapping_mem_gb = floor(Int, mem_gb * 0.8)
for sra_dir in size_sorted_sra_directories[1:N]
    trim_galore_dir_contents = readdir(joinpath(sra_dir, "trim_galore"), join=true)
    # Locate the paired read files by their filename suffixes.
    forward_idx = findfirst(f -> occursin(r"_1_val_1\.fq\.gz$", f), trim_galore_dir_contents)
    reverse_idx = findfirst(f -> occursin(r"_2_val_2\.fq\.gz$", f), trim_galore_dir_contents)
    if isnothing(forward_idx) || isnothing(reverse_idx)
        error("missing trimmed paired-end reads in $(joinpath(sra_dir, "trim_galore"))")
    end
    # Named *_reads so the loop does not assign to a variable called `reverse`,
    # which collides with Base.reverse.
    forward_reads = trim_galore_dir_contents[forward_idx]
    reverse_reads = trim_galore_dir_contents[reverse_idx]
    map_result = Mycelia.minimap_map_paired_end_with_index(
        fasta = compressed_fasta_export,
        forward = forward_reads,
        reverse = reverse_reads,
        mem_gb = mapping_mem_gb,
        threads = threads,
        as_string = true)
    outfile = map_result.outfile
    cmd = map_result.cmd
    # Only (re)submit when the output is absent, e.g. after a job ran out of time.
    if !isfile(outfile)
        println(outfile)
        Mycelia.scg_sbatch(
            job_name=basename(outfile),
            mail_user="cameron.prybol@gmail.com",
            logdir=mkpath("$(homedir())/workspace/slurmlogs"),
            mem_gb=mem_gb,
            cpus_per_task=threads,
            partition="batch",
            account="mpsnyder",
            time="7-00:00:00",
            cmd=cmd)
        # Pause between submissions; set this to be the runtime of the mapping jobs.
        sleep(60)
    else
        # Output already present: report that it exists and how large it is.
        @show isfile(outfile)
        @show Base.format_bytes(filesize(outfile))
    end
end