In [None]:
# Notebook bootstrap: check the loader path, activate a throw-away Pkg environment,
# and load Revise, the local Mycelia checkout and the listed registry packages.
#
# if hit plotting library issues, try resetting LD path for julia
# can set in ~/.local/share/jupyter/kernels/
# An unset variable is as acceptable as an empty one, so read it with a default:
# indexing ENV directly raises a KeyError when the variable is absent.
@assert get(ENV, "LD_LIBRARY_PATH", "") == ""
import Pkg
Pkg.activate(;temp=true)
Pkg.add("Revise")
import Revise

# develop the checkout under ~/workspace so local edits to Mycelia are used
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# registry packages to install and import by name
pkgs = String[
    "DataFrames"
]
Pkg.add(pkgs)
for pkg in pkgs
    # `import` needs a literal module name, hence building the statement from the string
    eval(Meta.parse("import $pkg"))
end

In [None]:
# Name of the BLAST database; it is the last path component of `path_to_db` below.
db = "20240418.nt"

In [None]:
# Location of the database under $SCRATCH/workspace/blastdb.
# Indexing ENV raises a KeyError when SCRATCH is not set.
path_to_db = joinpath(ENV["SCRATCH"], "workspace", "blastdb", db)
# Alternative kept for reference: obtain the path by downloading the database.
# path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")

In [None]:
# Path of the gzipped FASTA export: the database path with ".fna.gz" appended.
# Alternative kept for reference: have Mycelia produce the export.
# compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)
compressed_fasta_export = string(path_to_db, ".fna.gz")

In [None]:
# Stop here if the FASTA export is not on disk; later cells pass this path to Mycelia.
@assert isfile(compressed_fasta_export)

In [None]:
# Evaluate the constant so the notebook shows its value.
# NOTE(review): presumably the default for the `denominator` keyword that
# Mycelia.minimap_index is called with below — confirm against Mycelia.
Mycelia.DEFAULT_MINIMAP_DENOMINATOR

In [None]:
# Resource settings reused by the index call here and by the mapping jobs later on.
# Lawrencium LR7 specs
threads = 56
# 95% of 512Gb
mem_gb = 486

# Destructuring relies on the return value iterating as (cmd, outfile).
# NOTE(review): `denominator=10` is pinned here, while the paired-end mapping call later
# in this notebook passes no denominator — confirm both resolve to the same index file.
cmd, outfile = Mycelia.minimap_index(;
    fasta=compressed_fasta_export,
    mem_gb=mem_gb,
    threads=threads,
    mapping_type="sr",
    as_string=true,
    denominator=10,
)
# `cmd` is not run in this cell, so the index file is expected to be on disk already.
@assert isfile(outfile)

In [None]:
# Mycelia.add_bioconda_env("minimap2")

In [None]:
# Absolute path of the rclone 1.63.1 executable; interpolated into every rclone
# command built in the cells below.
rclone = "/global/software/rocky-8.x86_64/gcc/linux-rocky8-x86_64/gcc-8.5.0/rclone-1.63.1-5mixypplqx7n7i7g7hocthz35pzbzm2g/bin/rclone"

In [None]:
# Print the remotes known to rclone; `run` throws if the executable exits with an error.
run(`$rclone listremotes`)

In [None]:
# Local staging root for SRA data under $SCRATCH; created on demand.
SCRATCH_BASE = joinpath(
    ENV["SCRATCH"],
    "workspace",
    "Mycelia",
    "projects",
    "viral-exposome",
    "data",
    "SRA",
)
mkpath(SCRATCH_BASE)
@assert isdir(SCRATCH_BASE)

In [None]:
# Window of (1-based) positions in the remote SRA directory listing to process.
# can go to 10?
# 20?
# batch_size = 4
batch_size = 16
current_start = 80
# Inclusive end index: current_start:current_end must span exactly batch_size entries.
# Without the -1 the window holds batch_size + 1 samples, so advancing current_start
# by batch_size revisits (and can resubmit) the last sample of the previous batch.
current_end = current_start + batch_size - 1
# current_end = 21

In [None]:
# Submit mapping jobs for one batch of remote SRA sample directories.
#
# For every sample in the batch: list its remote trim_galore directory, skip it when a
# mapping result is already there, otherwise copy the trimmed read pair to scratch and
# hand Mycelia.lawrencium_sbatch a command that maps the reads, uploads the result to
# the remote directory and removes the local staging directory.
# should be starting on 8?
remote_sra_dir = "stanford_viral_exposome:viral-exposome/data/SRA"
# slashes are removed from each listed entry so it can be used as a path component
sra_directories = replace.(readlines(`$(rclone) lsf $(remote_sra_dir)`), "/" => "")
# Clamp the requested window to positions that exist in the listing: a window that runs
# past the end processes the remaining samples instead of raising a BoundsError.
batch_indices = intersect(current_start:current_end, eachindex(sra_directories))
for sra_dir in sra_directories[batch_indices]
    trim_galore_dir = joinpath(remote_sra_dir, sra_dir, "trim_galore")
    directory_contents = readlines(`$(rclone) lsf $(trim_galore_dir)`)
    # a step counts as finished only when exactly one matching file is present
    has_been_mapped = count(x -> occursin(r"nt.fna.gz.xsr.I[\d]+G.mmi.minimap2.sam.gz", x), directory_contents) == 1
    has_been_extracted = count(x -> occursin(r"nt.fna.gz.xsr.I[\d]+G.mmi.minimap2.sam.query-ref.tsv.gz", x), directory_contents) == 1
    @show sra_dir
    @show has_been_mapped
    @show has_been_extracted
    if has_been_mapped && has_been_extracted
        println("done")
        continue
    elseif has_been_mapped
        # TODO extract me!
        println("extract me!")
        continue
    end
    # not mapped yet: an extraction result without a mapping result is unexpected
    @assert !has_been_extracted

    # Locate the trimmed read pair. A sample without both mates is reported and skipped
    # so that one incomplete directory does not abort the rest of the batch.
    forward_index = findfirst(f -> occursin(r"_1_val_1\.fq\.gz$", f), directory_contents)
    reverse_index = findfirst(f -> occursin(r"_2_val_2\.fq\.gz$", f), directory_contents)
    if isnothing(forward_index) || isnothing(reverse_index)
        @warn "skipping sample: trimmed read pair not found in remote directory" sra_dir trim_galore_dir
        continue
    end

    local_sra_dir = joinpath(SCRATCH_BASE, sra_dir)

    forward = directory_contents[forward_index]
    remote_forward = joinpath(trim_galore_dir, forward)
    local_forward = joinpath(local_sra_dir, forward)

    reverse = directory_contents[reverse_index]
    remote_reverse = joinpath(trim_galore_dir, reverse)
    local_reverse = joinpath(local_sra_dir, reverse)

    @show forward, remote_forward, local_forward
    @show reverse, remote_reverse, local_reverse

    # stage each mate only when it is not already on scratch
    if !isfile(local_forward)
        run(`$(rclone) copy --verbose $(remote_forward) $(dirname(local_forward))`)
        @assert isfile(local_forward)
    end
    if !isfile(local_reverse)
        run(`$(rclone) copy --verbose $(remote_reverse) $(dirname(local_reverse))`)
        @assert isfile(local_reverse)
    end

    map_result = Mycelia.minimap_map_paired_end_with_index(
        fasta = compressed_fasta_export,
        forward = local_forward,
        reverse = local_reverse,
        mem_gb = mem_gb,
        threads = threads,
        as_string = true,
    )
    outfile = map_result.outfile
    cmd = map_result.cmd
    # @show cmd
    # chain mapping, upload and cleanup with && so each step runs only if the previous succeeded
    rclone_upload = "$(rclone) copy --verbose --drive-chunk-size 2G --drive-upload-cutoff 1T --tpslimit 1 --low-level-retries=10 --retries=10 --retries-sleep=10s  $(outfile) $(trim_galore_dir)"
    rm_local_directory = "rm -r $(local_sra_dir)"
    cmd = rstrip(cmd) * " \\\n&& " * rclone_upload * " \\\n&& " * rm_local_directory
    println(cmd)
    # resubmit if we run out of time
    if !isfile(outfile)
        println(outfile)
        Mycelia.lawrencium_sbatch(
            job_name=basename(outfile),
            mail_user="cameron.prybol@gmail.com",
            logdir=mkpath("$(homedir())/workspace/slurmlogs"),
            mem_gb=mem_gb,
            cpus_per_task=threads,
            partition="lr7",
            account="pc_mfnanofabio",
            time="3-00:00:00",
            qos="lr_normal",
            cmd=cmd)
        # set this to be the runtime of the mapping jobs
        sleep(60)
    else
        # a local result already exists: report it instead of submitting another job
        @show isfile(outfile)
        @show Base.format_bytes(filesize(outfile))
    end
end