In [None]:
# If plotting libraries fail to load, try resetting the LD path used by Julia.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# `get` is used so that an unset LD_LIBRARY_PATH counts as empty instead of
# raising a KeyError before the check can even run.
@assert isempty(get(ENV, "LD_LIBRARY_PATH", "")) "LD_LIBRARY_PATH must be empty or unset"
import Pkg
# Use a temporary environment for the packages added by this notebook.
Pkg.activate(;temp=true)
Pkg.add("Revise")
import Revise

# Alternative locations of the Mycelia checkout, kept for reference:
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registered packages to install and then import by name.
pkgs = String[
    "DataFrames",
    "uCSV",
    "OrderedCollections",
    "CSV"
    # "XAM",
    # "CodecZlib"
]
Pkg.add(pkgs)
for pkg in pkgs
    # `import` needs a literal module name, so the statement is built from the
    # package name and evaluated at top level.
    eval(Meta.parse("import $pkg"))
end

In [None]:
# The project root is the parent of the current working directory.
project_dir = pwd() |> dirname
# `mkpath` creates the directory when it is missing and returns its path.
data_dir = joinpath(project_dir, "data") |> mkpath

In [None]:
# BLAST database name and the path prefix of its copy under the home directory.
db = "nt"
path_to_db = joinpath(homedir(), "workspace", "blastdb", db)
# Calls kept for reference (presumably how the database and its FASTA export
# were originally produced -- confirm before re-enabling):
# path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")
# compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)
compressed_fasta_export = string(path_to_db, ".fna.gz")

In [None]:
# Resources requested per job: 12 threads and 32 GB of memory per thread.
threads = 12
mem_gb = 32 * threads
# mem_gb = # Int(floor(mem_gb * .8))

In [None]:
# Full paths of every entry directly under the shared delivery directory.
sequencing_runs = readdir(
    joinpath(data_dir, "Shared-ME-CFS", "destination-path");
    join = true,
)

In [None]:
# TODO prioritize samples by top level cohort THEN by sample size
# 1 FJ
# 2 FJMZ
# 3 E (external cohort)
# 4 UK (validation)

In [None]:
# Collect one row per paired-end lane: both FASTQ paths, the output prefix and
# the combined size in bytes of the two read files.
sample_metadata = DataFrames.DataFrame(
    forward = String[],
    reverse = String[],
    outdir = String[],
    joint_filesize = Int[],
)
for sequencing_run in sequencing_runs
    # A run keeps its reads under either "raw_data" or "01.RawData".
    raw_data_dir = joinpath(sequencing_run, "raw_data")
    if !isdir(raw_data_dir)
        raw_data_dir = joinpath(sequencing_run, "01.RawData")
        @assert isdir(raw_data_dir)
    end
    for sample_directory in readdir(raw_data_dir, join=true)
        fastq_files = readdir(sample_directory, join=true)
        # Dropping the read-pair suffix collapses both mates onto one lane prefix.
        lanes = unique(map(path -> replace(path, r"_[12]\.fq\.gz" => ""), fastq_files))
        for lane in lanes
            # Prefixes ending in this marker are only reported, never recorded.
            if occursin(r"d7c3CaC3$", lane)
                @show lane
                continue
            end
            forward = lane * "_1.fq.gz"
            @assert isfile(forward) forward
            reverse = lane * "_2.fq.gz"
            @assert isfile(reverse) reverse
            # Mirror the shared input layout under the local output tree.
            outdir = replace(lane, "Shared-ME-CFS/destination-path" => "local-ME-CFS")
            joint_filesize = filesize(forward) + filesize(reverse)
            push!(sample_metadata, (;forward, reverse, outdir, joint_filesize))
        end
    end
end
sample_metadata

# Disabled alternative: sort by size so that the smallest jobs run first.
# sample_metadata = sort(sample_metadata, "joint_filesize")

In [None]:
# The sample ID is the name of the directory that holds the lane prefix stored
# in the "outdir" column.
sample_metadata[!, "sample_ID"] = map(x -> basename(dirname(x)), sample_metadata[!, "outdir"])

# The cohort ID is the leading run of uppercase ASCII letters of the sample ID.
# `match` returns `nothing` when an ID has no such prefix, so that case is
# reported with the offending sample ID instead of failing on a field access.
sample_metadata[!, "cohort_ID"] = map(sample_metadata[!, "sample_ID"]) do sample_id
    cohort_match = match(r"^([A-Z]+).*", sample_id)
    if isnothing(cohort_match)
        error("sample ID \"$(sample_id)\" does not start with an uppercase cohort prefix")
    end
    return first(cohort_match.captures)
end

# Lower number = processed earlier.
cohort_id_to_priority = Dict(
    "FJ" => 1,
    "FJMZ" => 2,
    "E" => 3,     # external cohort
    "UK" => 4     # validation
)

# Look up each cohort's priority; an unknown cohort is reported by name together
# with the list of cohorts that do have a priority.
sample_metadata[!, "cohort_priority"] = map(sample_metadata[!, "cohort_ID"]) do cohort_id
    if !haskey(cohort_id_to_priority, cohort_id)
        known_cohorts = join(sort!(collect(keys(cohort_id_to_priority))), ", ")
        error("no priority defined for cohort \"$(cohort_id)\"; known cohorts: $(known_cohorts)")
    end
    return cohort_id_to_priority[cohort_id]
end

In [None]:
# Order rows in place by cohort priority, then sample ID, then combined FASTQ size.
sort_columns = ["cohort_priority", "sample_ID", "joint_filesize"]
sort!(sample_metadata, sort_columns)

In [None]:
# The batch covers the first 2^i rows. Rounds i = 1 through 7 are done;
# the next round to submit is i = 8.
i = 8
# Never ask for more rows than the table holds.
N = min(DataFrames.nrow(sample_metadata), 2^i)

In [None]:
# done_count = 0
# for row in DataFrames.eachrow(sample_metadata[1:N, :])
#     forward = row["forward"]
#     reverse = row["reverse"]
#     outdir = row["outdir"]
    
#     if !isdir(outdir)
#         # @show "making path $(outdir)"
#         mkpath(outdir)
#     end
    
#     map_result = Mycelia.minimap_map_paired_end_with_index(
#         fasta = compressed_fasta_export,
#         outdir = outdir,
#         forward = forward,
#         reverse = reverse,
#         mem_gb = Int(floor(mem_gb * .8)),
#         threads = threads,
#         denominator = 6,
#         as_string = true
#     )
    
#     outfile = map_result.outfile
#     cmd = map_result.cmd
    
#     # Check if the output file exists
#     if !isfile(outfile)
#         @show done_count
#     else
#         done_count += 1
#     end
# end

In [None]:
# Maximum number of new jobs to submit per run of the submission cell;
# update to 8, 12, 16, etc. (or similar) per day.
N_submissions = 12

In [None]:
# Submit mapping jobs for the first `N` rows of `sample_metadata`, stopping once
# `N_submissions` new jobs have been submitted by this run.
#
# For every row the mapping command and its expected output file are obtained
# from Mycelia first. Then:
#   - if the output file already exists, its path and size are shown and
#     nothing is submitted;
#   - otherwise the command is submitted with `Mycelia.scg_sbatch` and
#     `successful_submissions` is incremented.
# The counter therefore tracks jobs submitted here, not finished outputs.

successful_submissions = 0  # number of jobs submitted by this run so far

# Memory passed to the mapping command: 80% of the memory requested for the job,
# rounded down. It does not change between rows, so it is computed once.
mapping_mem_gb = floor(Int, mem_gb * 0.8)

for row in DataFrames.eachrow(sample_metadata[1:N, :])
    # Declared global so the counter update below also works when this code is
    # run as a plain script rather than interactively.
    global successful_submissions

    # Stop the loop once the submission quota has been reached
    if successful_submissions >= N_submissions
        println("Reached $(N_submissions) successful submissions. Stopping.")
        break
    end

    forward = row["forward"]
    reverse = row["reverse"]
    outdir = row["outdir"]

    # `mkpath` is a no-op for an existing directory, so no `isdir` guard is needed.
    mkpath(outdir)

    # NOTE(review): `denominator` and `as_string` are passed through unchanged;
    # their meaning is defined by Mycelia and is not visible here.
    map_result = Mycelia.minimap_map_paired_end_with_index(
        fasta = compressed_fasta_export,
        outdir = outdir,
        forward = forward,
        reverse = reverse,
        mem_gb = mapping_mem_gb,
        threads = threads,
        denominator = 6,
        as_string = true
    )

    outfile = map_result.outfile
    cmd = map_result.cmd

    if !isfile(outfile)
        # No output yet: submit a job to produce it.
        println("submitting job to produce $(outfile)")
        Mycelia.scg_sbatch(
            job_name=basename(outfile),
            mail_user="cameron.prybol@gmail.com",
            logdir=mkpath("$(homedir())/workspace/slurmlogs"),
            mem_gb=mem_gb,
            cpus_per_task=threads,
            partition="nih_s10",
            account="mpsnyder",
            time="7-00:00:00",
            cmd=cmd
        )
        sleep(60)  # Pause for 60 seconds between submissions
        successful_submissions += 1
    else
        # Output already present: report it and move on without submitting.
        @show outfile
        @show Base.format_bytes(filesize(outfile))
    end
end

In [None]:
# for row in DataFrames.eachrow(sample_metadata[1:N, :])
#     forward = row["forward"]
#     reverse = row["reverse"]
#     outdir = row["outdir"]
#     if !isdir(outdir)
#         @show "making path $(outdir)"
#         mkpath(outdir)
#     end
#     map_result = Mycelia.minimap_map_paired_end_with_index(fasta = compressed_fasta_export, outdir = outdir, forward = forward, reverse =reverse, mem_gb = Int(floor(mem_gb * .8)), threads=threads, denominator = 6, as_string=true)
#     outfile = map_result.outfile
#     cmd = map_result.cmd
#     # resubmit if we run out of time
#     if !isfile(outfile)
#         println(outfile)
#         Mycelia.scg_sbatch(
#             job_name=basename(outfile),
#             mail_user="cameron.prybol@gmail.com",
#             logdir=mkpath("$(homedir())/workspace/slurmlogs"),
#             mem_gb=mem_gb,
#             cpus_per_task=threads,
#             partition="nih_s10",
#             account="mpsnyder",
#             time="7-00:00:00",
#             cmd=cmd)
#         # make longer?
#         sleep(60)
#     else
#         @show isfile(outfile)
#         @show Base.format_bytes(filesize(outfile))
#     end
# end