In [None]:
# Notebook setup: check the environment, then load the packages used below.
#
# A non-empty LD_LIBRARY_PATH is rejected up front. An unset variable is
# treated the same as an empty one; indexing ENV directly would raise a
# KeyError when the variable is not defined at all.
ld_library_path = get(ENV, "LD_LIBRARY_PATH", "")
isempty(ld_library_path) ||
    error("LD_LIBRARY_PATH must be empty or unset, got: $(repr(ld_library_path))")

import Pkg
pkgs = [
    "Revise",
    "DataFrames",
    "uCSV"
]
# Pkg.add(pkgs)
for pkg in pkgs
    # Build the import expression from a Symbol instead of parsing source text.
    @eval import $(Symbol(pkg))
end
import Mycelia

In [None]:
# The project root is the parent of the current working directory; the data
# directory sits directly beneath it and is created when missing.
project_dir = pwd() |> dirname
data_dir = joinpath(project_dir, "data") |> mkpath

In [None]:
# Collect the gzipped reference FASTAs (*.fna.gz / *.fasta.gz), dropping any
# whose full path matches "revised" (case-insensitive), ordered by file size
# from smallest to largest.
viral_reference_fastas_dir = joinpath(data_dir, "viral-reference-fastas") |> mkpath
reference_fastas = filter(readdir(viral_reference_fastas_dir, join=true)) do path
    occursin(r"\.f(na|asta)\.gz$", path) && !occursin(r"revised"i, path)
end
sort!(reference_fastas, by=filesize)

In [None]:
# Log directory under the user's home (the same path is handed to
# Mycelia.sbatch as `logdir`); created if it does not exist yet.
slurm_logdir = mkpath(string(homedir(), "/workspace/slurmlogs"))

In [None]:
# long_read_fastas = filter(x -> occursin(r"\.bam\.fasta$", x), readdir(joinpath(data_dir, "exposome_data", "joint-reads"), join=true))
# sort!(long_read_fastas, by=x->filesize(x))

# /home/cjprybol/.julia/conda/3/x86_64/bin/mamba run --live-stream -n samtools samtools view workspace/Mycelia/projects/viral-exposome/data/exposome_data/joint-reads/alignments/ictv-examplar-viruses.fna.gz.m84085_231013_185624_s2.hifi_reads.bc1004.bam.fasta.minimap2.sorted.bam | less

# threads=8
# memory=64
# # 1-3 complete
# # 4 seems to be working
# for reference_fasta in reference_fastas[5:5]
#     for long_read_fasta in long_read_fastas
#         outdir = mkpath(joinpath(data_dir, "exposome_data", "joint-reads", "alignments"))
#         sam_outfile = joinpath(outdir, basename(reference_fasta) * "." * basename(long_read_fasta) * ".minimap2.sam")
#         outfile = replace(sam_outfile, ".sam" => ".sorted.bam")
#         # @show outfile
#         if !isfile(outfile) || (filesize(outfile) == 0)
#             # https://github.com/lh3/minimap2/blob/master/FAQ.md#3-the-output-sam-doesnt-have-a-header
#             # can use index prefix to split the memory
#             cmd =
#             """
#             $(Mycelia.MAMBA) run --live-stream -n minimap2 minimap2 -t $(threads) -ax map-pb $(reference_fasta) $(long_read_fasta) --split-prefix=$(sam_outfile).tmp -o $(sam_outfile) \\
#             && $(Mycelia.MAMBA) run --live-stream -n samtools samtools sort -u --threads $(threads) $(sam_outfile) \\
#             | $(Mycelia.MAMBA) run --live-stream -n samtools samtools view --bam --with-header --output $(outfile) \\
#             && rm $(sam_outfile)
#             """
#             # @show cmd
#             Mycelia.sbatch(
#                 job_name = "$(basename(outfile))",
#                 mail_user = "cameron.prybol@gmail.com",
#                 logdir = slurm_logdir,
#                 partition = "batch",
#                 account = "mpsnyder",
#                 mem_gb = memory,
#                 cpus_per_task= threads,
#                 cmd = cmd)
#         else
#             @info "$(outfile) already present..."
#         end
#     end
# end

In [None]:
# Read the two tab-separated metadata tables, join them on library name, and
# derive the sorted list of per-run directories under data/SRA.
exposome_metadata_dir = joinpath(project_dir, "metadata")

# Both tables are read with the same uCSV settings (tab-delimited, header on
# row 1, typedetectrows=300) and wrapped in a DataFrame.
load_metadata_table(path) = DataFrames.DataFrame(
    uCSV.read(path; delim='\t', header=1, typedetectrows=300)
)

exposome_environmental_data = load_metadata_table(
    joinpath(exposome_metadata_dir, "metadata_exposome.rds.tsv")
)

joint_sample_metadata = load_metadata_table(
    joinpath(exposome_metadata_dir, "exposome/joint_sample_metadata.tsv")
)

# The two library-name columns must be identical before one is used as join key.
@assert joint_sample_metadata[!, "Library Name"] == joint_sample_metadata[!, "LibraryName"]

joint_metadata = DataFrames.innerjoin(
    joint_sample_metadata,
    exposome_environmental_data;
    on="Library Name" => "samplenames",
)

short_read_sample_directories = sort!(
    [joinpath(project_dir, "data", "SRA", run_id) for run_id in joint_metadata[!, "Run"]]
)

# 64 ictv-examplar-viruses.fna.gz
# 64 ref_viruses_rep_genomes.fasta.gz
# fasta_file = reference_fastas[2]
# fasta_file = reference_fastas[3]
# memory = 64
# threads = 8

# memory = 64
# threads = 8
# 128 GCF_000001405.40_GRCh38.p14_genomic.fna.gz

In [None]:
# Submit one SLURM job per short-read sample. Each job aligns the trimmed
# paired-end reads against `reference_fasta` with minimap2, sorts the result
# with samtools, writes it as BAM, and removes the intermediate SAM file.
# Samples whose sorted BAM already exists and is non-empty are skipped.

# reference_fasta = "/oak/stanford/scg/lab_mpsnyder/cjprybol/Mycelia/projects/viral-exposome/data/viral-reference-fastas/IMGVR_all_nucleotides-high_confidence.fna.gz"
reference_fasta = "/oak/stanford/scg/lab_mpsnyder/cjprybol/Mycelia/projects/viral-exposome/data/viral-reference-fastas/nt_viruses.fasta.gz"

# Resources requested per job. These names were previously assigned only in
# commented-out code, so the loop raised an UndefVarError in a fresh session.
# NOTE(review): 8 threads / 64 GB are the values noted elsewhere in this
# notebook for other references -- confirm they are adequate for this one.
threads = 8
memory = 64

# Loop-invariant: resolve (and create) the log directory once instead of on
# every iteration. Same path as before.
slurm_logdir = mkpath("$(homedir())/workspace/slurmlogs")

for sample in short_read_sample_directories
    sample_id = basename(sample)
    # Named *_reads so that Base.reverse is not shadowed inside the loop.
    forward_reads = joinpath(sample, "trim_galore", "$(sample_id)_1_val_1.fq.gz")
    reverse_reads = joinpath(sample, "trim_galore", "$(sample_id)_2_val_2.fq.gz")
    outdir = mkpath(joinpath(sample, "alignments"))
    sam_outfile = joinpath(outdir, basename(reference_fasta) * "." * sample_id * ".minimap2.sam")
    # Anchor on the trailing extension so a ".sam" substring elsewhere in the
    # path (e.g. in a directory name) is never rewritten.
    outfile = replace(sam_outfile, r"\.sam$" => ".sorted.bam")
    # @show outfile
    if !isfile(outfile) || (filesize(outfile) == 0)
        # https://github.com/lh3/minimap2/blob/master/FAQ.md#3-the-output-sam-doesnt-have-a-header
        # can use index prefix to split the memory
        cmd =
        """
        $(Mycelia.MAMBA) run --live-stream -n minimap2 minimap2 -t $(threads) -ax sr $(reference_fasta) $(forward_reads) $(reverse_reads) --split-prefix=$(sam_outfile).tmp -o $(sam_outfile) \\
        && $(Mycelia.MAMBA) run --live-stream -n samtools samtools sort -u --threads $(threads) $(sam_outfile) \\
        | $(Mycelia.MAMBA) run --live-stream -n samtools samtools view --bam --with-header --output $(outfile) \\
        && rm $(sam_outfile)
        """
        Mycelia.sbatch(
            job_name = "$(basename(outfile))",
            mail_user = "cameron.prybol@gmail.com",
            logdir = slurm_logdir,
            partition = "batch",
            account = "mpsnyder",
            mem_gb = memory,
            cpus_per_task = threads,
            cmd = cmd)
    else
        @info "$(outfile) already present..."
    end
end