In [None]:
# if hit plotting library issues, try resetting LD path for julia
# can set in ~/.local/share/jupyter/kernels/
# An unset LD_LIBRARY_PATH is treated the same as an empty one: indexing ENV
# directly would throw a KeyError when the variable is not defined at all.
@assert get(ENV, "LD_LIBRARY_PATH", "") == ""
import Pkg
# Work in a throwaway environment so the installs below do not touch the default one
Pkg.activate(;temp=true)
Pkg.add("Revise")
import Revise

# Use the local development checkout of Mycelia (alternative locations kept for reference)
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Extra packages to install and then import by name
pkgs = String[
    "DataFrames"
]
Pkg.add(pkgs)
for pkg in pkgs
    # `import` needs a literal module name, so the statement is built from the string and evaluated
    eval(Meta.parse("import $pkg"))
end

In [None]:
# Name of the BLAST database this notebook works with; it is used as the last component of the local database path
db = "nt"

In [None]:
# Location of the BLAST database under the home directory: <home>/workspace/blastdb/<db>
path_to_db = joinpath(homedir(), "workspace", "blastdb", db)
# Disabled alternative: obtain the path from Mycelia's download helper instead
# path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")

In [None]:
function system_mem_to_minimap_index_size(;system_mem_gb, denominator=6)
    # Turn a memory amount in GB into a size string of the form "<N>G", where
    # N is system_mem_gb / denominator rounded down to a whole number.
    #
    # The denominator is a heuristic arrived at by trial and error. It should be
    # larger for larger memory machines, and smaller for smaller ones:
    #   denominator = 5  -> produced OOM for NT on NERSC
    #   denominator = 10 -> was only 56% efficient
    whole_gb = floor(Int, system_mem_gb / denominator)
    return string(whole_gb, "G")
end
# Sanity check of the heuristic for this machine. The function only accepts keyword
# arguments, so the memory has to be passed by name (a positional call raises MethodError).
system_mem_to_minimap_index_size(system_mem_gb=Mycelia.NERSC_MEM)

"""
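Build the minimap2 indexing command for `fasta` together with the path of the index file it will write.

Run this on the machine you intend to use to map the reads to confirm the index will fit in memory.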
"""
function minimap_index(;fasta, mem_gb, mapping_type, threads, as_string=false, denominator=6)
    # Builds (but does not run) the minimap2 command that indexes `fasta`.
    #
    # Keyword arguments:
    #   fasta        - path of the FASTA file to index; also the prefix of the index file name
    #   mem_gb       - memory amount handed to system_mem_to_minimap_index_size
    #   mapping_type - minimap2 preset passed to -x; must be one of the values asserted below
    #   threads      - value passed to -t
    #   as_string    - true returns the command as a String, false (default) as a Cmd
    #   denominator  - forwarded to system_mem_to_minimap_index_size
    #
    # Returns the named tuple (; cmd, outfile), where outfile is the .mmi path given to -d.
    # Throws an AssertionError when mapping_type is not one of the accepted presets.
    @assert mapping_type in ["map-hifi", "map-ont", "map-pb", "sr", "lr:hq"]
    index_size = system_mem_to_minimap_index_size(system_mem_gb=mem_gb, denominator=denominator)
    # The preset and the -I value are part of the file name, so indexes built with
    # different settings do not overwrite each other.
    outfile = "$(fasta).x$(mapping_type).I$(index_size).mmi"
    if as_string
        cmd = "$(Mycelia.CONDA_RUNNER) run --live-stream -n minimap2 minimap2 -t $(threads) -x $(mapping_type) -I$(index_size) -d $(outfile) $(fasta)"
    else
        # Both branches must write to `outfile`; this branch used to interpolate an
        # undefined `index_file`, which raised UndefVarError whenever as_string was false.
        cmd = `$(Mycelia.CONDA_RUNNER) run --live-stream -n minimap2 minimap2 -t $(threads) -x $(mapping_type) -I$(index_size) -d $(outfile) $(fasta)`
    end
    return (;cmd, outfile)
end

In [None]:
# compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)
# The export call above is disabled; the gzipped FASTA path is derived from the
# database path instead (<path_to_db>.fna.gz).
compressed_fasta_export = string(path_to_db, ".fna.gz")

In [None]:
# Earlier attempts, kept for reference.
# inefficient memory usage, but did succeed
# NOTE(review): the comments inside system_mem_to_minimap_index_size record denominator = 5 as an OOM for NT on NERSC — confirm which note is accurate
# cmd, outfile = minimap_index(fasta=compressed_fasta_export, mem_gb=Mycelia.NERSC_MEM, mapping_type="map-hifi", threads=Mycelia.NERSC_CPU, as_string=true, denominator=5)
# inefficient memory usage, but did succeed
# cmd, outfile = minimap_index(fasta=compressed_fasta_export, mem_gb=Mycelia.NERSC_MEM, mapping_type="map-hifi", threads=Mycelia.NERSC_CPU, as_string=true, denominator=10)
# Current setting: denominator = 6, with the command requested as a String.
cmd, outfile = minimap_index(
    fasta = compressed_fasta_export,
    mem_gb = Mycelia.NERSC_MEM,
    mapping_type = "map-hifi",
    threads = Mycelia.NERSC_CPU,
    as_string = true,
    denominator = 6,
)

In [None]:
# Hand the indexing command to Mycelia's NERSC sbatch helper only when the index
# file is not already on disk, so re-running this cell does not resubmit finished work.
# NOTE(review): presumably this queues a Slurm job under NERSC's premium QOS — confirm against Mycelia.
if !isfile(outfile)
    # show which index file is about to be built
    println(outfile)
    Mycelia.nersc_sbatch_premium(
        job_name=basename(outfile),
        mail_user="cameron.prybol@gmail.com",
        # mkpath creates the log directory if it is missing and returns its path
        logdir=mkpath("$(homedir())/workspace/slurmlogs"),
        mem_gb=Mycelia.NERSC_MEM,
        cpus_per_task=Mycelia.NERSC_CPU,
        cmd=cmd)
# else
#     @show isfile(outfile)
end