In [None]:
import Pkg
# Activate the project environment located in the current working directory.
Pkg.activate(".")

# not for the faint of heart!
# Pkg.update()

# Packages used by this notebook. Each one is added to the active environment
# and then imported by name (no names are brought into scope unqualified).
pkgs = [
    "ArgParse",
    "Base64",
    "BioSequences",
    "DataFrames",
    "Dates",
    "DelimitedFiles",
    "FASTX",
    "GLM",
    "HTTP",
    "JSON",
    "Graphs",
    "MetaGraphs",
    "MD5",
    "Statistics",
    "StatsPlots",
    "uCSV",
    "CodecZlib",
    "YAML",
    "Revise",
    "Kmers",
    "StatsBase",
]
Pkg.add(pkgs)
for pkg in pkgs
    # Splice the package name in as a Symbol instead of parsing a string of
    # source code; only a single identifier can ever reach `import` this way.
    @eval import $(Symbol(pkg))
end

# Develop the package found three directories up (presumably the Mycelia
# checkout this notebook lives in -- confirm) and import it.
Pkg.develop(path="../../..")
import Mycelia

In [None]:
# `data` directory that sits next to (is a sibling of) the current working directory.
data_dir = let parent = dirname(pwd())
    joinpath(parent, "data")
end

In [None]:
# Identifier of the Kraken database to use. It is reused as the tarball
# basename, in the download URL, and in the per-sample result file names.
kraken_db = "k2_pluspfp_20221209"

In [None]:
# Local path of the compressed database archive: ~/workspace/kraken-databases/<kraken_db>.tar.gz
local_kraken_db_tar = joinpath(
    homedir(),
    "workspace",
    "kraken-databases",
    kraken_db * ".tar.gz",
)

In [None]:
# Directory the archive is extracted into: the archive path minus its
# `.tar.gz` extension. The pattern is anchored to the end of the string so a
# `.tar.gz` occurring earlier in the path (e.g. in a parent directory name)
# is left alone.
local_kraken_db = replace(local_kraken_db_tar, r"\.tar\.gz$" => "")

In [None]:
# Download (if needed) and unpack the Kraken database unless an extracted copy
# already exists. The existence of `local_kraken_db` is the only "already done"
# marker, so a failed extraction must not leave that directory behind.
if isdir(local_kraken_db)
    @info "kraken db found @ $(local_kraken_db)"
else
    if !isfile(local_kraken_db_tar)
        # Save the archive into the directory that holds `local_kraken_db_tar`.
        run(
            `wget
            --no-clobber
            --directory-prefix $(dirname(local_kraken_db_tar))
            https://genome-idx.s3.amazonaws.com/kraken/$(kraken_db).tar.gz
            `)
    end
    # tar requires its --directory target to exist, and this branch only runs
    # when it does not, so create it before extracting.
    mkpath(local_kraken_db)
    try
        run(`tar -xvzf $(local_kraken_db_tar) --directory $(local_kraken_db)`)
    catch
        # Drop the partial extraction so the next run retries instead of
        # treating an incomplete directory as a finished database.
        rm(local_kraken_db; recursive=true, force=true)
        rethrow()
    end
end

In [None]:
# Full paths of every entry under <data_dir>/SRA, leaving out any path that
# contains ".ipynb_checkpoints".
SRR_paths = filter(readdir(joinpath(data_dir, "SRA"); join=true)) do entry
    !occursin(".ipynb_checkpoints", entry)
end

In [None]:
# Run kraken2 on every SRA sample directory and turn each report into a Krona
# chart. A stage is skipped when the file it produces is already on disk.
for SRR_path in SRR_paths
    SRR = basename(SRR_path)
    kraken_dir = mkpath(joinpath(SRR_path, "kraken"))

    # Result files are named <SRR>.<kraken_db>.kraken-{output,report}.tsv
    stem = string(SRR, ".", kraken_db)
    output = joinpath(kraken_dir, stem * ".kraken-output.tsv")
    report = joinpath(kraken_dir, stem * ".kraken-report.tsv")
    krona_file = string(report, ".krona")
    krona_html = string(krona_file, ".html")

    # Paired reads as found in the sample's trim_galore directory.
    trim_dir = joinpath(SRR_path, "trim_galore")
    trimmed_forward_reads = joinpath(trim_dir, SRR * "_1_val_1.fq.gz")
    trimmed_reverse_reads = joinpath(trim_dir, SRR * "_2_val_2.fq.gz")

    # kraken2 summaries recorded from earlier runs:
    #   sequences   Mbp        seconds   classified             unclassified
    #   5666        1.67       0.106     905      (15.97%)      4761     (84.03%)
    #   56807236    16754.54   175.923   11939597 (21.02%)      44867639 (78.98%)
    #   75492257    22266.95   398.346   20297310 (26.89%)      55194947 (73.11%)
    #   68081844    20094.20   439.799   12585997 (18.49%)      55495847 (81.51%)
    #
    # kraken2 v2.1.2 was installed from source (shell history, 2023-02-18):
    #   wget https://github.com/DerrickWood/kraken2/archive/refs/tags/v2.1.2.tar.gz
    #   tar -xvzf v2.1.2.tar.gz
    #   cd kraken2-2.1.2/
    #   ./install_kraken2.sh
    #   ./install_kraken2.sh ./bin
    #
    # Noted run time: 16 minutes @ 32 cores
    cmd = `/home/jovyan/software/kraken2-2.1.2/bin/kraken2
        --report-zero-counts
        --use-names
        --threads $(Sys.CPU_THREADS)
        --db $(local_kraken_db)
        --output $(output)
        --report $(report)
        --gzip-compressed
        --paired $(trimmed_forward_reads) $(trimmed_reverse_reads)`

    # Classification: skipped once a report file exists.
    isfile(report) || run(cmd)

    # Compress the per-read output whenever an uncompressed copy is present.
    isfile(output) && run(`pigz --best $(output)`)

    # Report -> Krona text -> Krona HTML, each step only if its file is missing.
    isfile(krona_file) || run(`python kreport2krona.py -r $(report) -o $(krona_file)`)
    isfile(krona_html) || run(`ktImportText $(krona_file) -o $(krona_html)`)

    # run(`rclone copy SRR_kraken_directory google-drive`)
end