In [None]:
import Pkg

# Work inside the project environment located in the current directory.
Pkg.activate(".")

# not for the faint of heart!
# Pkg.update()

# Names of the packages this notebook installs and then imports.
pkgs = [
    "ArgParse", "Base64", "BioSequences", "DataFrames", "Dates",
    "DelimitedFiles", "FASTX", "GLM", "HTTP", "JSON",
    "Graphs", "MetaGraphs", "MD5", "Statistics", "StatsPlots",
    "uCSV", "CodecZlib", "YAML", "Revise", "Kmers",
    "StatsBase",
]
Pkg.add(pkgs)

# `import` (not `using`) each package so every call below stays module-qualified.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end

import Mycelia

In [None]:
# MMSeqs Easy taxonomy — amino acid reference database.
# Timing notes from earlier runs:
#   download:   ~ 5 hours (may not be tolerable?)
#   processing: 7930.350543 seconds (710.71 k allocations: 40.309 MiB, 0.00% gc time, 0.00% compilation time)
#               i.e. 2 hours 12m
#   iterative search with varying sensitivity: 1.5 hours
Mycelia.download_mmseqs_db(;
    db="UniRef100",
    conda_env="viral-pangenome-discovery",
)
# Related shell command (presumably what the call above wraps — TODO confirm):
# mmseqs databases --compressed 1 --remove-tmp-files 1 --force-reuse 1 UniRef100 $HOME/mmseqs/UniRef100 $HOME/mmseqs/tmp

In [None]:
# Perl modules noted alongside this step (presumably required by the BLAST
# database download tooling — TODO confirm):
#   cpan -f App::cpanminus
#   cpanm List::MoreUtils
#   mamba install perl-list-moreutils

# Location and name of the BLAST database to fetch.
blastdb_dir = string(homedir(), "/blastdb")
blast_db = "nt"

# The database is always requested from the "ncbi" source.
# Disabled alternative, kept for reference:
#   if isdir(blastdb_dir) && !isempty(readdir(blastdb_dir))
#       @info "blast db detected, using existing"
#       Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir)
#   else
#       # we're probably on a cloud build
#       Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir, source="gcp")
#   end
Mycelia.download_blast_db(;
    db=blast_db,
    outdir=blastdb_dir,
    source="ncbi",
)

In [None]:
# Data directory: the `data` folder inside the parent of the current working directory.
data_dir = joinpath(dirname(pwd()), "data")

In [None]:
# Full paths of every entry under <data_dir>/SRA, skipping Jupyter checkpoint folders.
SRR_paths = filter(readdir(joinpath(data_dir, "SRA"); join=true)) do path
    !occursin(".ipynb_checkpoints", path)
end

In [None]:
# Select which entry of SRR_paths the rest of the notebook processes;
# uncomment one of the alternatives below to switch to another entry.
SRR_path = SRR_paths[1]
# SRR_path = SRR_paths[2]
# SRR_path = SRR_paths[3]

In [None]:
# Last path component of the selected entry
# (presumably the SRA run accession — TODO confirm against the data layout).
SRR = basename(SRR_path)

In [None]:

# Search the amino-acid FASTA against the UniRef100 MMseqs database in the home directory.
# NOTE(review): `OUT_DIR`, `amino_acid_fasta` and `config` are not assigned in the
# cells visible here — confirm they are defined before this cell runs.
Mycelia.run_mmseqs_easy_search(;
    out_dir=OUT_DIR,
    query_fasta=amino_acid_fasta,
    target_database=string(homedir(), "/mmseqs/UniRef100"),
    outfile=config["annotation run identifier"],
)
# TODO: add taxonomic information to uniref classifications above!

In [None]:
# BLAST output directory, nested under the selected SRR_path.
# NOTE(review): other cells reference `OUT_DIR`, which is not assigned in the cells
# visible here — confirm whether `out_dir` was the intended name.
out_dir = joinpath(SRR_path, "blast")

In [None]:
# Local BLAST database path: database directory joined with the database name.
db_path = joinpath(blastdb_dir, blast_db)

# blast contigs against NCBI: name of the report file that is parsed below.
# NOTE(review): `assembled_fasta` is not assigned in the cells visible here.
ncbi_blast_outfile = string(assembled_fasta, ".blastn.", blast_db, ".txt")

# Recorded timings from earlier runs:
#   >= 1-2 hours to run remotely
#   1679.701928 seconds
#   3497.275545
#   28m 0.44s
# NOTE(review): the line below only references the function; it is never called,
# so no BLAST search is started by this cell. Confirm the intended arguments and
# turn it into an actual call.
Mycelia.run_blast

# Parse the report; the result is later joined to build the contig info file.
ncbi_blast_results = Mycelia.parse_blast_report(ncbi_blast_outfile)

# Fetch the NCBI taxonomy dump once; an existing archive in the working directory is reused.
isfile("taxdump.tar.gz") || run(`wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz`)

# Unpack the archive into ./taxdump unless that directory is already present.
if !isdir("taxdump")
    mkdir("taxdump")
    run(`tar -xvzf taxdump.tar.gz --directory taxdump`)
end

# Write every distinct, non-missing subject tax id to a text file, one id per line.
# NOTE(review): `OUT_DIR` and `config` are not assigned in the cells visible here.
detected_tax_id_file = "$(OUT_DIR)/$(config["assembly run identifier"]).detected_tax_ids.txt"
open(detected_tax_id_file, "w") do io
    foreach(unique(skipmissing(ncbi_blast_results[!, "subject tax id"]))) do taxid
        println(io, taxid)
    end
end

# Run `taxonkit lineage` on that file and map tax id (first tab-separated field,
# parsed as Int) to lineage (second field).
taxid_to_lineage_map = Dict(
    parse(Int, fields[1]) => fields[2] for fields in map(
        line -> split(line, '\t'),
        readlines(`taxonkit lineage --data-dir taxdump $(detected_tax_id_file)`),
    )
)

# Annotate each hit with its lineage; ids absent from the map get an empty string.
ncbi_blast_results[!, "lineage"] = map(ncbi_blast_results[!, "subject tax id"]) do taxid
    get(taxid_to_lineage_map, taxid, "")
end

# Query length expressed as a percentage of the subject length, rounded to 3 digits.
ncbi_blast_results[!, "% of subject length"] = round.(ncbi_blast_results[!, "query length"] ./ ncbi_blast_results[!, "subject length"] * 100, digits=3)

# Attach the BLAST hits to the per-contig coverage table. A left join keeps
# contigs that have no hit.
# NOTE(review): `qualimap_contig_coverage_table` is not assigned in the cells
# visible here — confirm it is defined before this cell runs.
contig_info_table = DataFrames.leftjoin(qualimap_contig_coverage_table, ncbi_blast_results, on="Contig" => "query id")

# get top 10 hits for each contig (ranked by bit score).
# This definition had been commented out while the statements below still read
# `contig_info_table_top_hits`, which fails with UndefVarError on a fresh kernel.
contig_info_table_top_hits = DataFrames.combine(DataFrames.groupby(contig_info_table, "Contig")) do gdf
    first(sort(gdf, "bit score", rev=true), 10)
end

# re-order columns based on utility
reordered_columns = [
    "Contig",
    "Length",
    "Mapped bases",
    "Mean coverage",
    "Standard Deviation",
    "% Mapped bases",
    "subject id",
    "subject acc.",
    "subject title",
    "subject tax id",
    "lineage",
    "% identity",
    "% of subject length",
    "evalue",
    "bit score",
    "query length",
    "subject length",
    "alignment length",
    "q. start",
    "q. end",
    "s. start",
    "s. end",
    "identical",
    "mismatches"
]
contig_info_table_top_hits = contig_info_table_top_hits[!, reordered_columns]

# Best-covered contigs first, then strongest hits first.
sort!(contig_info_table_top_hits, ["% Mapped bases", "bit score"], rev=true)

# NOTE(review): the file names say "config_info"; "contig_info" was probably
# intended. Left unchanged because other code may read these exact paths.
# NOTE(review): `OUT_DIR`, `config` and `results` are not assigned in the cells
# visible here.
contig_info_csv = "$(OUT_DIR)/$(config["assembly run identifier"]).config_info.csv"
contig_info_tsv = "$(OUT_DIR)/$(config["assembly run identifier"]).config_info.tsv"
uCSV.write(contig_info_csv, contig_info_table_top_hits, quotes='"')
# uCSV.write delimits with ',' unless told otherwise, so the .tsv file must
# request a tab explicitly.
uCSV.write(contig_info_tsv, contig_info_table_top_hits, delim='\t')
results["contig info"] = contig_info_csv