In [None]:
# If plotting libraries fail to load, try resetting the LD path for Julia.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# An unset variable is treated the same as an empty one (no KeyError), and a
# non-empty value fails with a message that shows what was found.
isempty(get(ENV, "LD_LIBRARY_PATH", "")) || error(
    "LD_LIBRARY_PATH must be empty for this notebook, got: " * ENV["LD_LIBRARY_PATH"]
)
import Pkg
# Work in a throwaway environment so this notebook does not modify a shared project.
Pkg.activate(; temp = true)
Pkg.add("Revise")
import Revise

# Alternative locations of the Mycelia checkout:
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registered packages used by the notebook; each one is added and then imported by name.
pkgs = String[
    "DataFrames",
    "uCSV",
    "OrderedCollections",
    "CSV"
    # "XAM",
    # "CodecZlib"
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the import expression directly instead of parsing a string of code.
    @eval import $(Symbol(pkg))
end

In [None]:
# The project root is the parent of the current working directory; the data
# directory lives directly under it and is created if it does not exist yet.
project_dir = pwd() |> dirname
data_dir = joinpath(project_dir, "data") |> mkpath

In [None]:
# Name of the BLAST database and its expected location under ~/workspace/blastdb.
db = "nt"
path_to_db = joinpath(homedir(), "workspace", "blastdb", db)
# Disabled alternatives that obtain the database and its FASTA export via Mycelia:
# path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")
# compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)
# Path of the gzipped FASTA export: the database path plus ".fna.gz".
compressed_fasta_export = string(path_to_db, ".fna.gz")

In [None]:
# Resource settings: thread count, and memory in GB budgeted at 32 per thread.
threads = 12
mem_gb = 32 * threads
# Disabled: shrink the budget to 80% up front.
# mem_gb = # Int(floor(mem_gb * .8))

In [None]:
# Full paths of every entry in the "samples" directory under data_dir.
pilot_samples = let samples_dir = joinpath(data_dir, "samples")
    joinpath.(samples_dir, readdir(samples_dir))
end

In [None]:
# Peek at the first sample: keep the entries whose path ends in "trimgalore",
# take the first of them, and list its contents as full paths.
pilot_sample = first(pilot_samples)
qc_filtered_batches = [
    entry for entry in readdir(pilot_sample; join = true) if occursin(r"trimgalore$", entry)
]
qc_filtered_batch = first(qc_filtered_batches)
readdir(qc_filtered_batch; join = true)

In [None]:
# One row per trimgalore batch: the sample path (stored as sample_ID), the forward
# and reverse read files, and their combined on-disk size as reported by `filesize`.
pilot_sample_table = DataFrames.DataFrame(
    sample_ID = String[],
    forward_reads = String[],
    reverse_reads = String[],
    filesize = Int[]
)

# run on just non-human reads first
# (to switch read sets, match r"1_val_1\.fq\.gz$" and r"2_val_2\.fq\.gz$" instead)
for pilot_sample in pilot_samples
    # Loop-local names are used throughout so that no variable defined by another
    # cell is overwritten while the table is being built.
    trimgalore_dirs = filter(x -> occursin(r"trimgalore$", x), readdir(pilot_sample, join=true))
    for qc_filtered_batch in trimgalore_dirs
        # List the batch directory once and reuse the listing for both mates.
        batch_files = readdir(qc_filtered_batch, join=true)

        forward_matches = filter(x -> occursin(r"\.unmapped\.1\.fq\.gz$", x), batch_files)
        length(forward_matches) == 1 || error(
            "expected exactly one forward read file in $(qc_filtered_batch), " *
            "found $(length(forward_matches))"
        )
        forward_path = first(forward_matches)

        reverse_matches = filter(x -> occursin(r"\.unmapped\.2\.fq\.gz$", x), batch_files)
        length(reverse_matches) == 1 || error(
            "expected exactly one reverse read file in $(qc_filtered_batch), " *
            "found $(length(reverse_matches))"
        )
        reverse_path = first(reverse_matches)

        joint_filesize = filesize(forward_path) + filesize(reverse_path)
        push!(
            pilot_sample_table,
            (
                sample_ID = pilot_sample,
                forward_reads = forward_path,
                reverse_reads = reverse_path,
                filesize = joint_filesize,
            ),
        )
    end
end
pilot_sample_table

In [None]:
# Order the table in place from the smallest to the largest combined read size.
sort!(pilot_sample_table, "filesize"; rev = false)

In [None]:
# For each read pair, ask Mycelia for the minimap mapping command and its output
# path, then either report the existing output or announce that a job is needed.
for sample_row in DataFrames.eachrow(pilot_sample_table)
    forward_fastq = sample_row["forward_reads"]
    reverse_fastq = sample_row["reverse_reads"]

    # Results are written next to the forward reads; the mapper is given 80% of
    # the memory budget, rounded down to a whole number of GB.
    map_result = Mycelia.minimap_map_paired_end_with_index(;
        fasta = compressed_fasta_export,
        outdir = dirname(forward_fastq),
        forward = forward_fastq,
        reverse = reverse_fastq,
        mem_gb = floor(Int, mem_gb * 0.8),
        threads = threads,
        denominator = 6,
        as_string = true,
    )
    outfile = map_result.outfile
    cmd = map_result.cmd

    # Output already present: show its path and size, then move to the next pair.
    if isfile(outfile)
        @show outfile
        @show Base.format_bytes(filesize(outfile))
        continue
    end

    println("submitting job to produce $(outfile)")
    # The actual submission is currently disabled; enable the call below to queue `cmd`.
    # Mycelia.scg_sbatch(
    #     job_name=basename(outfile),
    #     mail_user="cameron.prybol@gmail.com",
    #     logdir=mkpath("$(homedir())/workspace/slurmlogs"),
    #     mem_gb=mem_gb,
    #     cpus_per_task=threads,
    #     partition="nih_s10",
    #     account="mpsnyder",
    #     time="7-00:00:00",
    #     cmd=cmd
    # )
    # sleep(60)  # Pause for 60 seconds
end