In [None]:
import Pkg
Pkg.activate(".")

# not for the faint of heart!
# Pkg.update()

# Packages this notebook relies on. Every entry is added to the active project
# and then imported by name, so functions are referenced as `PackageName.func`.
pkgs = [
    "ArgParse",
    "Base64",
    "BioSequences",
    "DataFrames",
    "Dates",
    "DelimitedFiles",
    "FASTX",
    "GLM",
    "HTTP",
    "JSON",
    "Graphs",
    "MetaGraphs",
    "MD5",
    "Statistics",
    "StatsPlots",
    "uCSV",
    "CodecZlib",
    "YAML",
    "Revise",
    "Kmers",
    "StatsBase",
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the `import` expression from a Symbol instead of parsing a string of
    # source code; the effect is the same as writing `import <pkg>` by hand.
    @eval import $(Symbol(pkg))
end

import Mycelia

In [None]:
# Data lives in a `data` directory that is a sibling of the current working directory.
data_dir = let parent_dir = dirname(pwd())
    joinpath(parent_dir, "data")
end

In [None]:
# Perl prerequisites (presumably needed by the BLAST database download step; confirm):
#   cpan -f App::cpanminus
#   cpanm List::MoreUtils
#   mamba install perl-list-moreutils

# Local BLAST database: reuse whatever is already in `blastdb_dir`, otherwise download it.
# NOTE(review): any non-empty `blastdb_dir` counts as "detected", including one that only
# holds the taxdump files created below.
blastdb_dir = "$(homedir())/workspace/blastdb"
blast_db = "nt"
if isdir(blastdb_dir) && !isempty(readdir(blastdb_dir))
    @info "blast db detected, using existing"
else
    # Alternative call that runs inside a dedicated conda environment:
    # Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir, source="ncbi", conda_env="viral-pangenome-discovery")
    Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir, source="ncbi")
end

# NCBI taxonomy dump, stored next to the BLAST database.
# move me up to the initial download location
mkpath(blastdb_dir)
taxdump_tar = joinpath(blastdb_dir, "taxdump.tar.gz")
if !isfile(taxdump_tar)
    # Download straight to the path that the check above looks at (previously wget wrote
    # into the current working directory, so the archive was re-fetched on every run).
    # A temporary name is used so that an interrupted download is never mistaken for a
    # complete archive.
    partial_tar = taxdump_tar * ".partial"
    run(`wget --quiet -O $(partial_tar) https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz`)
    mv(partial_tar, taxdump_tar, force=true)
end
taxdump_dir = mkpath(joinpath(blastdb_dir, "taxdump"))
if isempty(readdir(taxdump_dir))
    # Extract from the archive's real location rather than a path relative to the cwd.
    run(`tar -xvzf $(taxdump_tar) --directory $(taxdump_dir)`)
end

In [None]:
# Sample directories under `data/SRA` that still need a contig-info table: notebook
# checkpoint folders are ignored, a MEGAHIT assembly FASTA must be present, and samples
# that already have `<sample>.contig_info.tsv` are left out.
SRR_paths = filter(readdir(joinpath(data_dir, "SRA"), join=true)) do sample_dir
    occursin(".ipynb_checkpoints", sample_dir) && return false
    has_assembly = isfile(joinpath(sample_dir, "megahit", "final.contigs.fastg.gfa.fna"))
    already_summarized = isfile(joinpath(sample_dir, basename(sample_dir) * ".contig_info.tsv"))
    return has_assembly && !already_summarized
end

In [None]:
# For every pending sample: BLAST the assembled contigs (blastn) against the local
# database, attach a taxonomic lineage to each hit via `taxonkit lineage`, join the hits
# onto the qualimap per-contig coverage table, and write the combined table to
# `<sample dir>/<SRR>.contig_info.tsv`.

# The database location is the same for every sample, so it is resolved once here from
# the notebook-level `blastdb_dir` / `blast_db` instead of being re-assigned per iteration.
db_path = joinpath(blastdb_dir, blast_db)

for SRR_path in SRR_paths
    SRR = basename(SRR_path)
    @show SRR
    contig_info_tsv = "$(SRR_path)/$(SRR).contig_info.tsv"
    if isfile(contig_info_tsv)
        @info "$(contig_info_tsv) already present, skipping..."
        continue
    end

    assembled_fasta = joinpath(SRR_path, "megahit", "final.contigs.fastg.gfa.fna")

    @time blast_report = Mycelia.run_blast(out_dir = SRR_path, fasta = assembled_fasta, blast_db = db_path, blast_command = "blastn")
    # join the blast results to generate the contig info file
    ncbi_blast_results = Mycelia.parse_blast_report(blast_report)

    # taxonkit reads the taxids to resolve from a file, one id per line
    detected_tax_id_file = "$(assembled_fasta).detected_tax_ids.txt"
    open(detected_tax_id_file, "w") do io
        for taxid in unique(filter(!ismissing, ncbi_blast_results[!, "subject tax id"]))
            println(io, taxid)
        end
    end

    # Each taxonkit output line is read as `<taxid>\t<lineage>`. Lines that do not have
    # both fields, or whose first field is not an integer, are skipped instead of
    # aborting the whole run.
    taxid_to_lineage_map = Dict{Int, String}()
    for line in eachline(`taxonkit lineage --data-dir $(taxdump_dir) $(detected_tax_id_file)`)
        fields = split(line, '\t')
        length(fields) >= 2 || continue
        taxid = tryparse(Int, fields[1])
        taxid === nothing && continue
        taxid_to_lineage_map[taxid] = fields[2]
    end

    qualimap_report_txt = joinpath(SRR_path, "megahit", "qualimap", "genome_results.txt")
    qualimap_contig_coverage_table = Mycelia.parse_qualimap_contig_coverage(qualimap_report_txt)

    # Hits whose taxid has no known lineage (or is missing) get an empty lineage string
    ncbi_blast_results[!, "lineage"] = map(x -> get(taxid_to_lineage_map, x, ""), ncbi_blast_results[!, "subject tax id"])
    # Query length expressed as a percentage of the subject length
    ncbi_blast_results[!, "% of subject length"] = round.(ncbi_blast_results[!, "query length"] ./ ncbi_blast_results[!, "subject length"] * 100, digits=3)
    # Left join keeps every contig from the coverage table, including those without hits
    contig_info_table = DataFrames.leftjoin(qualimap_contig_coverage_table, ncbi_blast_results, on="Contig" => "query id")

    # re-order columns based on utility
    reordered_columns = [
        "Contig",
        "Length",
        "Mapped bases",
        "Mean coverage",
        "Standard Deviation",
        "% Mapped bases",
        "subject id",
        "subject acc.",
        "subject title",
        "subject tax id",
        "lineage",
        "% identity",
        "% of subject length",
        "evalue",
        "bit score",
        "query length",
        "subject length",
        "alignment length",
        "q. start",
        "q. end",
        "s. start",
        "s. end",
        "identical",
        "mismatches"
    ]
    contig_info_table = contig_info_table[!, reordered_columns]
    sort!(contig_info_table, ["% Mapped bases", "bit score"], rev=true)

    # consider gzipping for large files!
    # The file is named *.tsv, so write it tab-delimited; the default comma delimiter
    # would also clash with commas inside the subject titles.
    uCSV.write(contig_info_tsv, contig_info_table, delim='\t')
end