In [None]:
# Activate the project environment found in the current working directory.
import Pkg
Pkg.activate(".")

# not for the faint of heart!
# Pkg.update()

# Registry packages this notebook needs; they are added to the active
# environment and then imported by name in the loop below.
pkgs = [
"ProgressMeter",
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the statement `import <pkg>` as text, parse it, and evaluate it.
    eval(Meta.parse("import $pkg"))
end

# Register the package located three directories up as a development
# dependency (presumably the Mycelia checkout containing this notebook -
# TODO confirm), then import it.
Pkg.develop(path="../../..")
import Mycelia

In [None]:
"""
    download_and_filter_reads(; outdir="", srr_identifier="")

Produce trimmed paired-end reads for `srr_identifier` under `outdir`.

If both trimmed files
(`<outdir>/trim_galore/<id>_1_val_1.fq.gz` and `..._2_val_2.fq.gz`) already exist,
nothing is downloaded or trimmed. Otherwise `fasterq_dump` and `trim_galore` are
called in that order with the same `outdir` and identifier.

The compressed raw reads (`<outdir>/<id>_1.fastq.gz`, `<outdir>/<id>_2.fastq.gz`)
are deleted only once both trimmed files are confirmed to exist; if they are not,
a warning is logged and the raw reads are kept so the trimming step can be retried.

# Keywords
- `outdir`: directory holding the raw reads and the `trim_galore` subdirectory.
- `srr_identifier`: accession used as the file-name prefix.

# Returns
`nothing`.
"""
function download_and_filter_reads(;outdir="", srr_identifier="")
    forward_reads_gz = joinpath(outdir, "$(srr_identifier)_1.fastq.gz")
    reverse_reads_gz = joinpath(outdir, "$(srr_identifier)_2.fastq.gz")
    trimmed_dir = joinpath(outdir, "trim_galore")
    trimmed_forward_reads = joinpath(trimmed_dir, "$(srr_identifier)_1_val_1.fq.gz")
    trimmed_reverse_reads = joinpath(trimmed_dir, "$(srr_identifier)_2_val_2.fq.gz")

    # Re-evaluated after the pipeline runs, so the cleanup decision reflects
    # what is actually on disk rather than what we expect to be there.
    trimmed_present() = isfile(trimmed_forward_reads) && isfile(trimmed_reverse_reads)

    if !trimmed_present()
        fasterq_dump(outdir=outdir, srr_identifier=srr_identifier)
        trim_galore(outdir=outdir, identifier=srr_identifier)
    end

    if trimmed_present()
        # The raw archives are only needed as input to trimming.
        rm(forward_reads_gz, force=true)
        rm(reverse_reads_gz, force=true)
    else
        @warn "trimmed reads missing for $(srr_identifier); keeping raw reads in $(outdir)"
    end
    return nothing
end

In [None]:
"""
    fasterq_dump(; outdir="", srr_identifier="")

Download the paired reads of `srr_identifier` into `outdir` with the external
`fasterq-dump` tool and compress them with `pigz`, yielding
`<outdir>/<id>_1.fastq.gz` and `<outdir>/<id>_2.fastq.gz`.

The download is skipped only when BOTH compressed files already exist. If just
one of them is present (for example after an interrupted run), any leftover raw
or compressed read files for this identifier are removed and the download is
repeated from scratch.

# Keywords
- `outdir`: destination directory passed to `fasterq-dump --outdir`.
- `srr_identifier`: accession to download; also the output file-name prefix.

# Returns
`nothing`.

# Throws
Whatever `run` throws when `fasterq-dump` or `pigz` is missing or exits non-zero.
"""
function fasterq_dump(;outdir="", srr_identifier="")
    forward_reads = joinpath(outdir, "$(srr_identifier)_1.fastq")
    reverse_reads = joinpath(outdir, "$(srr_identifier)_2.fastq")
    forward_reads_gz = forward_reads * ".gz"
    reverse_reads_gz = reverse_reads * ".gz"

    if isfile(forward_reads_gz) && isfile(reverse_reads_gz)
        @info "$(forward_reads_gz) & $(reverse_reads_gz) already present"
        return nothing
    end

    # Start from a clean slate: partial output of an earlier attempt must not
    # be mistaken for finished data, nor get in the way of the tools below.
    for stale in (forward_reads, reverse_reads, forward_reads_gz, reverse_reads_gz)
        rm(stale, force=true)
    end

    # --progress doesn't work well for jupyter output
    fasterq_dump_cmd = `
        fasterq-dump
            --outdir $(outdir)
            --mem 1G
            --split-3
            --threads $(min(Sys.CPU_THREADS, 4))
            --skip-technical
            --verbose
            $(srr_identifier)`
    @time run(fasterq_dump_cmd)
    run(`pigz $(forward_reads)`)
    run(`pigz $(reverse_reads)`)
    return nothing
end

In [None]:
"""
    trim_galore(; outdir="", identifier="")

Run the external `trim_galore` tool in paired mode on
`<outdir>/<identifier>_1.fastq.gz` and `<outdir>/<identifier>_2.fastq.gz`,
writing into `<outdir>/trim_galore`.

The run is skipped only when BOTH expected outputs
(`<identifier>_1_val_1.fq.gz` and `<identifier>_2_val_2.fq.gz`) already exist;
a single surviving output file triggers a fresh run.

# Keywords
- `outdir`: directory containing the compressed input reads.
- `identifier`: file-name prefix shared by the input and output files.

# Returns
`nothing`.

# Throws
Whatever `run` throws when `trim_galore` is missing or exits non-zero.
"""
function trim_galore(;outdir="", identifier="")
    trim_galore_dir = joinpath(outdir, "trim_galore")

    forward_reads = joinpath(outdir, "$(identifier)_1.fastq.gz")
    reverse_reads = joinpath(outdir, "$(identifier)_2.fastq.gz")

    trimmed_forward_reads = joinpath(trim_galore_dir, "$(identifier)_1_val_1.fq.gz")
    trimmed_reverse_reads = joinpath(trim_galore_dir, "$(identifier)_2_val_2.fq.gz")

    if isfile(trimmed_forward_reads) && isfile(trimmed_reverse_reads)
        @info "$(trimmed_forward_reads) & $(trimmed_reverse_reads) already present"
        return nothing
    end

    # Thread count is capped at 4, matching the download step.
    cmd = `trim_galore --suppress_warn --cores $(min(Sys.CPU_THREADS, 4)) --output_dir $(trim_galore_dir) --paired $(forward_reads) $(reverse_reads)`
    run(cmd)
    return nothing
end

In [None]:
# Accession list, one identifier per line, located relative to the parent of
# the current working directory.
srr_list = joinpath(dirname(pwd()), "metadata", "exposome", "SraAccList.txt")
# Strip surrounding whitespace and drop blank lines so that stray empty lines
# in the list are not passed on as accessions.
srr_identifiers = [
    String(strip(line)) for line in eachline(srr_list) if !isempty(strip(line))
]

In [None]:
# Process at most the first 100 accessions. `done` flips to `true` only when
# every sample finished without an exception.
done = false
# `min(end, 100)` keeps the slice in bounds when the list holds fewer than
# 100 identifiers.
ProgressMeter.@showprogress for srr_identifier in srr_identifiers[1:min(end, 100)]
    sample_outdir = joinpath(dirname(pwd()), "data", "SRA", srr_identifier)
    download_and_filter_reads(outdir=sample_outdir, srr_identifier=srr_identifier)
end
done = true