In [None]:
# Environment guard and package loading for this notebook.
#
# LD_LIBRARY_PATH is required to be empty before any package is loaded. An unset
# variable is treated the same as an empty one, so a clean environment passes the
# check instead of failing with a KeyError on the ENV lookup.
@assert get(ENV, "LD_LIBRARY_PATH", "") == ""
import Pkg
pkgs = [
    "Revise",
    "DataFrames",
    "uCSV"
]
# Uncomment to install the packages listed above:
# Pkg.add(pkgs)
# Import every listed package at top level. The package name is spliced into the
# import expression as a Symbol instead of being parsed out of a string.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end
import Mycelia

In [None]:
# The project root is the parent of the current working directory.
project_dir = pwd() |> dirname
# `mkpath` creates the directory when it is missing and hands back its path.
data_dir = joinpath(project_dir, "data") |> mkpath

In [None]:
# Directory holding the reference FASTA files (created under `data_dir` if missing).
viral_reference_fastas_dir = mkpath(joinpath(data_dir, "viral-reference-fastas"))
# Full paths of every entry ending in `.fna.gz` or `.fasta.gz`, ordered by file
# size (smallest first). `readdir` returns a fresh vector, so it is filtered and
# sorted in place.
reference_fastas = sort!(
    filter!(
        path -> occursin(r"\.f(na|asta)\.gz$", path),
        readdir(viral_reference_fastas_dir; join=true),
    );
    by=filesize,
)

In [None]:
# Log directory passed to the sbatch submissions as `logdir`; created if missing.
slurm_logdir = mkpath(string(homedir(), "/workspace/slurmlogs"))
exposome_metadata_dir = joinpath(project_dir, "metadata")

# Read both tab-delimited metadata tables with the same parser settings
# (first row is the header, column types detected from the first 300 rows).
exposome_environmental_data, joint_sample_metadata = map((
    "metadata_exposome.rds.tsv",
    "exposome/joint_sample_metadata.tsv",
)) do relative_path
    DataFrames.DataFrame(uCSV.read(
        joinpath(exposome_metadata_dir, relative_path);
        delim='\t',
        header=1,
        typedetectrows=300,
    ))
end

# Sanity check: the two library-name columns must agree row for row.
@assert joint_sample_metadata[!, "Library Name"] == joint_sample_metadata[!, "LibraryName"]

# Keep only the samples that appear in both tables, matched on the library name.
joint_metadata = DataFrames.innerjoin(
    joint_sample_metadata, exposome_environmental_data;
    on = "Library Name" => "samplenames",
)

# One directory per run under <project_dir>/data/SRA, in sorted order.
sample_directories = sort!([
    joinpath(project_dir, "data", "SRA", run) for run in joint_metadata[!, "Run"]
])

# NOTE(review): the leading numbers below look like the memory (GB) used for each
# reference file — confirm.
# 64 ictv-examplar-viruses.fna.gz
# 64 ref_viruses_rep_genomes.fasta.gz
# fasta_file = reference_fastas[2]
# fasta_file = reference_fastas[3]
# Default resource request; `memory` is passed to sbatch as mem_gb and `threads`
# as cpus_per_task.
memory = 64
threads = 8

In [None]:
# Reference to align against: the first entry of `reference_fastas`.
fasta_file = first(reference_fastas)

In [None]:
# Larger resource request for the submission loop: `memory` is passed to sbatch as
# mem_gb, `threads` as cpus_per_task and to the bwa-mem2 / samtools thread flags.
memory = 128
threads = 16
# 128 GCF_000001405.40_GRCh38.p14_genomic.fna.gz
# NOTE(review): presumably the memory (GB) needed for the GRCh38 reference — confirm.

In [None]:
# Submit one sbatch job per sample directory. Each job runs bwa-mem2 on the
# sample's two trim_galore read files (`_1_val_1` / `_2_val_2`) against
# `fasta_file`, pipes the result through `samtools sort` and `samtools view`, and
# writes <sample>/alignments/<reference>.sorted.bam. Samples whose BAM already
# exists with a non-zero size are skipped.
for sample_dir in sample_directories
    sample_id = basename(sample_dir)
    trimmed_dir = joinpath(sample_dir, "trim_galore")
    reads_r1 = joinpath(trimmed_dir, sample_id * "_1_val_1.fq.gz")
    reads_r2 = joinpath(trimmed_dir, sample_id * "_2_val_2.fq.gz")
    # The alignments directory is created even when the sample ends up skipped.
    alignment_dir = mkpath(joinpath(sample_dir, "alignments"))
    reference_name = basename(fasta_file)
    sorted_bam = joinpath(alignment_dir, reference_name * ".sorted.bam")
    # @show sorted_bam

    # Nothing to do when a non-empty BAM is already in place.
    if isfile(sorted_bam) && filesize(sorted_bam) != 0
        @info "$(sorted_bam) already present..."
        continue
    end

    # Three pipeline stages, each run through mamba in its own environment.
    stages = (
        "$(Mycelia.MAMBA) run --live-stream -n bwa-mem2 bwa-mem2 mem -t $(threads) $(fasta_file) $(reads_r1) $(reads_r2)",
        "$(Mycelia.MAMBA) run --live-stream -n samtools samtools sort -u --threads $(threads)",
        "$(Mycelia.MAMBA) run --live-stream -n samtools samtools view --bam --with-header --output $(sorted_bam)",
    )
    # Stages are joined with a shell line continuation followed by a pipe, and the
    # command ends with a newline.
    pipeline = join(stages, " \\\n| ") * "\n"

    Mycelia.sbatch(;
        job_name = "bwa-mem2.$(reference_name).$(sample_id)",
        mail_user = "cameron.prybol@gmail.com",
        logdir = slurm_logdir,
        partition = "batch",
        account = "mpsnyder",
        mem_gb = memory,
        cpus_per_task = threads,
        cmd = pipeline,
    )
end