In [None]:
# If the plotting libraries misbehave, try resetting Julia's LD path
# (it can be set in ~/.local/share/jupyter/kernels/):
# @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg

# Packages to import by name; uncomment entries as they become needed.
pkgs = [
    "Revise",
    # "FASTX",
    # "StatsBase",
    # "Distributions",
    # "StatsPlots",
    # "Random",
    # "Dates",
    # "DataFrames",
    # "BioSequences",
    # "Conda",
    # "Downloads"
]
# Pkg.add(pkgs)

# Build an `import <name>` statement from each string and evaluate it at top level.
foreach(pkgs) do pkg_name
    eval(Meta.parse("import $pkg_name"))
end

# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# The project root is taken to be the parent of the current working directory.
PROJECT_BASEDIR = pwd() |> dirname
data_dir = joinpath(PROJECT_BASEDIR, "data")
# Genome files live under <project>/data/genomes; create the directory if it is missing.
genome_dir = joinpath(data_dir, "genomes")
mkpath(genome_dir)

In [None]:
# Reference genome FASTA files found directly in `genome_dir`, smallest file first.
# Only names ending in ".fna" are kept; names containing ".vcf" or "normalized"
# are skipped. The checks run on the basename so that a parent directory whose
# path happens to contain one of those substrings cannot silently empty the list.
fasta_files = let
    function is_reference_fasta(path)
        name = basename(path)
        return occursin(r"\.fna$", name) &&
               !occursin(".vcf", name) &&
               !occursin("normalized", name)
    end
    sort(filter(is_reference_fasta, readdir(genome_dir, join=true)), by=filesize)
end

In [None]:
# Unique path prefixes (everything up to and including the ".<N>x" coverage tag)
# of the ART short-read files in `genome_dir`. Files whose names contain
# "trimming_report" or "_val_" are excluded.
short_read_sets = let
    # Name-based checks use the basename so directory names cannot affect the filter.
    function is_art_read_file(path)
        name = basename(path)
        return occursin(r"\.fna\.art", name) &&
               occursin(r"\.fq\.gz", name) &&
               !occursin("trimming_report", name) &&
               !occursin("_val_", name)
    end
    # The prefix keeps the directory part, so the match runs on the full path.
    function read_set_prefix(path)
        m = match(r"^(.+\.\d+x)\.", path)
        m === nothing && error("cannot find a '.<N>x.' coverage tag in read file name: $(path)")
        return m.captures[1]
    end
    unique(map(read_set_prefix, filter(is_art_read_file, readdir(genome_dir, join=true))))
end

In [None]:
threads = 8
# memory = 64

# For every short-read set, submit a batch job that aligns the `_val_1`/`_val_2`
# read pair to its reference with minimap2 (`-ax sr`) and writes a coordinate-sorted
# BAM next to the reads. Sets whose sorted BAM already exists (non-empty) are skipped.
for short_read_set in short_read_sets
    # `forward_reads`/`reverse_reads` rather than `forward`/`reverse`, to avoid
    # shadowing Base.reverse inside the loop.
    forward_reads = short_read_set * ".1_val_1.fq.gz"
    reverse_reads = short_read_set * ".2_val_2.fq.gz"
    sam_outfile = short_read_set * ".minimap2.sam"
    # Anchor on the suffix: a plain ".sam" => ... replacement would also rewrite any
    # other ".sam" occurrence in the path (e.g. a ".sample" directory or genome name).
    outfile = replace(sam_outfile, r"\.sam$" => ".sorted.bam")
    # DON'T USE MODIFIED FASTA!!
    reference_fasta = replace(short_read_set, r"\.normalized\.vcf\.fna\.art\..*$" => "")

    # `replace` returns its input unchanged when the pattern does not match; in that
    # case there is no reference to align against, so do not submit a job.
    if reference_fasta == short_read_set
        @warn "could not derive the reference fasta from $(short_read_set), skipping..."
        continue
    end

    if isfile(outfile) && filesize(outfile) > 0
        @info "$(outfile) already present..."
        continue
    end

    @info "$(outfile) not present, generating..."
    # https://github.com/lh3/minimap2/blob/master/FAQ.md#3-the-output-sam-doesnt-have-a-header
    # can use index prefix to split the memory
    #
    # `samtools sort` writes BAM itself via `-o`, so no second `samtools view` process
    # is needed. Keeping every step in one `&&` chain also means a failed sort stops
    # the chain before the intermediate SAM file is removed.
    cmd =
    """
    $(Mycelia.MAMBA) run --live-stream -n minimap2 minimap2 -t $(threads) -ax sr $(reference_fasta) $(forward_reads) $(reverse_reads) --split-prefix=$(sam_outfile).tmp -o $(sam_outfile) \\
    && $(Mycelia.MAMBA) run --live-stream -n samtools samtools sort --threads $(threads) -o $(outfile) $(sam_outfile) \\
    && rm $(sam_outfile)
    """
    # Alternative submission call that was previously used instead of nersc_sbatch:
    # Mycelia.sbatch(
    #     job_name = "$(basename(outfile))",
    #     mail_user = "cameron.prybol@gmail.com",
    #     logdir = mkpath("$(homedir())/workspace/slurmlogs"),
    #     partition = "batch",
    #     account = "mpsnyder",
    #     mem_gb = memory,
    #     cpus_per_task= threads,
    #     cmd = cmd)
    Mycelia.nersc_sbatch(
        job_name = "$(basename(outfile))",
        mail_user = "cameron.prybol@gmail.com",
        logdir = mkpath("$(homedir())/workspace/slurmlogs"),
        qos = "shared",
        cpus_per_task = threads,
        cmd = cmd)
end

In [None]:
# Long-read FASTQ files in `genome_dir` (names ending in ".filtlong.fq.gz"),
# ordered from smallest to largest file.
long_read_fastqs = let
    candidates = readdir(genome_dir, join=true)
    filtlong_reads = filter(path -> occursin(r"\.filtlong\.fq\.gz$", path), candidates)
    sort(filtlong_reads, by=filesize)
end

In [None]:
threads = 8
# memory = 64

# For every long-read FASTQ, submit a batch job that aligns the reads to their
# reference with minimap2 (`-ax map-pb`) and writes a coordinate-sorted BAM next to
# the reads. Inputs whose sorted BAM already exists (non-empty) are skipped.
for long_read_fastq in long_read_fastqs
    sam_outfile = long_read_fastq * ".minimap2.sam"
    # Anchor on the suffix: a plain ".sam" => ... replacement would also rewrite any
    # other ".sam" occurrence in the path (e.g. a ".sample" directory or genome name).
    outfile = replace(sam_outfile, r"\.sam$" => ".sorted.bam")
    # DON'T USE MODIFIED FASTA!
    reference_fasta = replace(long_read_fastq, r"\.normalized\.vcf\.fna\.badread\.\d+x\.filtlong\.fq\.gz$" => "")

    # `replace` returns its input unchanged when the pattern does not match; the reads
    # would then be aligned against themselves, so do not submit a job.
    if reference_fasta == long_read_fastq
        @warn "could not derive the reference fasta from $(long_read_fastq), skipping..."
        continue
    end

    if isfile(outfile) && filesize(outfile) > 0
        @info "$(outfile) already present..."
        continue
    end

    # https://github.com/lh3/minimap2/blob/master/FAQ.md#3-the-output-sam-doesnt-have-a-header
    # can use index prefix to split the memory
    @info "$(outfile) not present, generating..."
    # `samtools sort` writes BAM itself via `-o`, so no second `samtools view` process
    # is needed. Keeping every step in one `&&` chain also means a failed sort stops
    # the chain before the intermediate SAM file is removed.
    cmd =
    """
    $(Mycelia.MAMBA) run --live-stream -n minimap2 minimap2 -t $(threads) -ax map-pb $(reference_fasta) $(long_read_fastq) --split-prefix=$(sam_outfile).tmp -o $(sam_outfile) \\
    && $(Mycelia.MAMBA) run --live-stream -n samtools samtools sort --threads $(threads) -o $(outfile) $(sam_outfile) \\
    && rm $(sam_outfile)
    """
    # Alternative submission call that was previously used instead of nersc_sbatch:
    # Mycelia.sbatch(
    #     job_name = "$(basename(outfile))",
    #     mail_user = "cameron.prybol@gmail.com",
    #     logdir = mkpath("$(homedir())/workspace/slurmlogs"),
    #     partition = "batch",
    #     account = "mpsnyder",
    #     mem_gb = memory,
    #     cpus_per_task= threads,
    #     cmd = cmd)
    Mycelia.nersc_sbatch(
        job_name = "$(basename(outfile))",
        mail_user = "cameron.prybol@gmail.com",
        logdir = mkpath("$(homedir())/workspace/slurmlogs"),
        qos = "shared",
        cpus_per_task = threads,
        cmd = cmd)
end