In [None]:
# Activate the project environment located in the current working directory so
# that the package operations below apply to it.
import Pkg
Pkg.activate(".")
# Pkg.add("Revise")
# import Revise

# Pkg.develop(path="../../..")
# import Mycelia

# Packages this notebook depends on; each one is added to the active
# environment and then imported by name.
pkgs = [
"DataFrames",
"uCSV",
"ProgressMeter",
]
Pkg.add(pkgs)
for pkg in pkgs
    # `import` (not `using`) binds only the module name, which is why every call
    # further down is module-qualified (e.g. `DataFrames.DataFrame`).
    eval(Meta.parse("import $pkg"))
end

In [None]:
# Data directory: the `data` folder that sits next to (is a sibling of) the
# current working directory.
data_dir = joinpath(dirname(pwd()), "data")

In [None]:
# Full paths of every entry under `<data_dir>/SRA`, leaving out any entry whose
# path mentions the Jupyter `.ipynb_checkpoints` folder.
SRR_paths = filter(readdir(joinpath(data_dir, "SRA"), join=true)) do entry
    !occursin(".ipynb_checkpoints", entry)
end

In [None]:
"""
    parse_mmseqs_easy_taxonomy_lca_tsv(lca_tsv)

Read the tab-delimited MMseqs2 `easy-taxonomy` LCA report `lca_tsv` and return
it as a `DataFrames.DataFrame` with descriptive column names.

The columns are named, in file order:

1. `contig_id` - contig
2. `taxon_id` - a single taxonomy numeric identifier
3. `taxon_rank` - a taxonomic rank column
4. `taxon_name` - taxonomic name column
5. `fragments_retained` - fragments retained
6. `fragments_taxonomically_assigned` - fragments taxonomically assigned
7. `fragments_in_agreement_with_assignment` - fragments in agreement with the
   contig label (i.e. same taxid or have it as an ancestor)
8. `support -log(E-value)` - the support received -log(E-value)

`contig_id` is always returned as a column of strings.
"""
function parse_mmseqs_easy_taxonomy_lca_tsv(lca_tsv)
    # Only the data columns are used; whatever header `uCSV.read` reports is
    # discarded because the column names are assigned explicitly below.
    data, _ = uCSV.read(lca_tsv, delim='\t')
    header = [
        "contig_id",
        "taxon_id",
        "taxon_rank",
        "taxon_name",
        "fragments_retained",
        "fragments_taxonomically_assigned",
        "fragments_in_agreement_with_assignment",
        "support -log(E-value)"
    ]
    table = DataFrames.DataFrame(data, header)
    # Force identifiers to strings regardless of the element type the reader
    # produced for that column (presumably so they can be matched against
    # string contig names from other tables -- confirm against callers).
    table[!, "contig_id"] = string.(table[!, "contig_id"])
    return table
end

In [None]:
"""
    parse_qualimap_contig_coverage(qualimap_report_txt)

Extract the per-contig coverage rows from the text report
`qualimap_report_txt` and return them as a `DataFrames.DataFrame`.

A report line is treated as a coverage row when it matches
`<tab>name<tab>integer<tab>integer<tab>number<tab>number` at its end. The
resulting columns are `Contig`, `Length`, `Mapped bases`, `Mean coverage` and
`Standard Deviation`, plus a derived `% Mapped bases` column holding each
row's share (in percent) of the summed `Mapped bases`.
"""
function parse_qualimap_contig_coverage(qualimap_report_txt)
    coverage_line_regex = r"\t.*?\t\d+\t\d+\t[\d\.]+\t[\d\.]+$"
    column_names = ["Contig", "Length", "Mapped bases", "Mean coverage", "Standard Deviation"]
    column_types = [String, Int, Int, Float64, Float64]

    coverage_rows = String[]
    for report_line in readlines(string(qualimap_report_txt))
        occursin(coverage_line_regex, report_line) || continue
        fields = split(report_line, '\t')
        # Everything before the first tab is dropped; the remaining fields are
        # the five columns named above.
        push!(coverage_rows, join(fields[2:end], '\t'))
    end

    buffer = IOBuffer(join(coverage_rows, '\n'))
    columns, _ = uCSV.read(buffer, delim='\t', types=column_types)
    coverage = DataFrames.DataFrame(columns, column_names)

    mapped_bases = coverage[!, "Mapped bases"]
    coverage[!, "% Mapped bases"] = mapped_bases ./ sum(mapped_bases) .* 100
    return coverage
end

In [None]:
# function parse_blast_report(blast_report)
#     # example header line 
#     # "# Fields: query id, subject id, subject acc., subject acc.ver, subject title, query length, subject length, q. start, q. end, s. start, s. end, evalue, bit score, score, alignment length, % identity, identical, mismatches, subject tax id"
#     header_lines = collect(Iterators.filter(x -> occursin(r"# Fields:", x), eachline(blast_report)))
#     if isempty(header_lines)
#         @info "no hits found, returning empty table"
#         return DataFrames.DataFrame()
#     end
#     header_line = first(header_lines)
#     header = split(last(split(header_line, ": ")), ", ")
#     blast_col_types = Dict(
#         "query id" => String,
#         "query title" => String,
#         "subject id" => String,
#         "subject gi" => String,
#         "subject acc." => String,
#         "subject acc.ver" => String,
#         "subject title" => String,
#         "query length" => Int,
#         "subject length" => Int,
#         "q. start" => Int,
#         "q. end" => Int,
#         "s. start" => Int,
#         "s. end" => Int,
#         "evalue" => Float64,
#         "bit score" => Float64,
#         "score" => Float64,
#         "alignment length" => Int,
#         "% identity" => Float64,
#         "identical" => Int,
#         "mismatches" => Int,
#         "subject tax id" => Int,
#         "subject sci name" => String,
#         "subject com names" => String,
#         "subject blast name" => String,
#         "subject super kingdom" => String,
#         "subject tax ids" => String,
#         "subject sci names" => String,
#         "subject com names" => String,
#         "subject blast names" => String,
#         "subject super kingdoms" => String,
#         "subject title" => String,
#         "subject titles" => String
#     )
#     data, _ = uCSV.read(
#         blast_report,
#         delim='\t',
#         comment='#',
#         # skipmalformed=true,
#         allowmissing=true,
#         encodings=Dict("N/A" => missing),
#         types=[blast_col_types[h] for h in header])
#     return DataFrames.DataFrame(data, header, makeunique=true)
# end

In [None]:
"""
    parse_blast_report(blast_report, top_n = 1)

Parse the commented, tab-delimited BLAST report at path `blast_report` and
return a `DataFrames.DataFrame` holding up to `top_n` hits per query, taken in
the order they appear in the file.

Column names come from the first line containing `# Fields:`. Column types are
looked up in an internal table; field names not in that table are read as
`String`. The literal value `N/A` is read as `missing`.

Details:

- Rows are returned in file order (queries in order of first appearance).
- The first hit of every query is always kept, even when `top_n < 1`.
- Lines starting with `#` and blank lines are ignored.
- If the report contains no hit lines, an empty table is returned. When there
  is also no `# Fields:` line to take column names from, that empty table has
  just the columns `query id` (`String`) and `subject tax id` (`Int`).

# Throws
- `ArgumentError` if hit lines are present but no `# Fields:` line exists, so
  the columns cannot be named.
"""
function parse_blast_report(blast_report, top_n = 1)
    # example header line
    # "# Fields: query id, subject id, subject acc., subject acc.ver, subject title, query length, subject length, q. start, q. end, s. start, s. end, evalue, bit score, score, alignment length, % identity, identical, mismatches, subject tax id"
    blast_col_types = Dict(
        "query id" => String,
        "query title" => String,
        "subject id" => String,
        "subject gi" => String,
        "subject acc." => String,
        "subject acc.ver" => String,
        "subject title" => String,
        "subject titles" => String,
        "query length" => Int,
        "subject length" => Int,
        "q. start" => Int,
        "q. end" => Int,
        "s. start" => Int,
        "s. end" => Int,
        "evalue" => Float64,
        "bit score" => Float64,
        "score" => Float64,
        "alignment length" => Int,
        "% identity" => Float64,
        "identical" => Int,
        "mismatches" => Int,
        "subject tax id" => Int,
        "subject tax ids" => String,
        "subject sci name" => String,
        "subject sci names" => String,
        "subject com names" => String,
        "subject blast name" => String,
        "subject blast names" => String,
        "subject super kingdom" => String,
        "subject super kingdoms" => String
    )

    # Single pass over the file: remember the first "# Fields:" line and collect
    # the leading hits of each query.
    header_line = nothing
    contig_order = String[]    # queries in order of first appearance
    top_n_contig_hits = Dict{String, Vector{String}}()
    for line in eachline(blast_report)
        if header_line === nothing && occursin("# Fields:", line)
            header_line = line
        end
        # Comment lines and blank lines carry no hits.
        (startswith(line, "#") || isempty(strip(line))) && continue
        contig = String(first(split(line, '\t')))
        if !haskey(top_n_contig_hits, contig)
            top_n_contig_hits[contig] = [line]
            push!(contig_order, contig)
        elseif length(top_n_contig_hits[contig]) < top_n
            push!(top_n_contig_hits[contig], line)
        end
    end

    if header_line === nothing
        isempty(contig_order) || throw(ArgumentError(
            "no \"# Fields:\" line found in $(blast_report); cannot name the columns"))
        # No header and no hits: fall back to a minimal schema so the result
        # can still be joined on "query id".
        header = ["query id", "subject tax id"]
    else
        header = String.(split(last(split(header_line, ": ")), ", "))
    end
    # Unknown field names fall back to String instead of raising a KeyError.
    types = [get(blast_col_types, h, String) for h in header]

    if isempty(contig_order)
        @info "no hits found, returning empty table"
        return DataFrames.DataFrame(Any[T[] for T in types], header, makeunique=true)
    end

    # Rebuild a comment-free report that contains only the retained hit lines.
    reconstructed_file = join((join(top_n_contig_hits[c], '\n') for c in contig_order), '\n')
    data, _ = uCSV.read(
        IOBuffer(reconstructed_file),
        delim='\t',
        allowmissing=true,
        encodings=Dict("N/A" => missing),
        types=types)
    return DataFrames.DataFrame(data, header, makeunique=true)
end

In [None]:
# ProgressMeter.@showprogress for SRR_path in SRR_paths
#     # SRR_path = first(SRR_paths)
#     qualimap_coverage_table = parse_qualimap_contig_coverage(joinpath(SRR_path, "megahit", "qualimap", "genome_results.txt"))
#     mmseqs_lca_files = filter(x -> occursin("_lca.tsv", x) && occursin("final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy", x), readdir(joinpath(SRR_path, "mmseqs_easy_taxonomy"), join=true))

#     # mmseqs_lca_file = first(mmseqs_lca_files)
#     for mmseqs_lca_file in mmseqs_lca_files

#         parse_mmseqs_easy_taxonomy_lca_tsv(mmseqs_lca_file)
#         lca_table = parse_mmseqs_easy_taxonomy_lca_tsv(mmseqs_lca_file)
#         contig_coverage_and_classification_table = DataFrames.leftjoin(qualimap_coverage_table, lca_table, on="Contig" => "contig_id")
#         contig_coverage_and_classification_table[!, "taxon_id"] = map(x -> ismissing(x) ? 0 : x, contig_coverage_and_classification_table[!, "taxon_id"])
#         contig_coverage_and_classification_table = ifelse.(ismissing.(contig_coverage_and_classification_table), "", contig_coverage_and_classification_table)

#         uCSV.write(
#             replace(mmseqs_lca_file, r"\.tsv$" => ".coverage-and-classification.tsv"),
#             contig_coverage_and_classification_table,
#             delim='\t'
#         )
#     end
# end

In [None]:
# For every sample directory in `SRR_paths`, join the per-contig qualimap
# coverage table with the top BLAST hit per contig and write the result next to
# the BLAST report as `*.coverage-and-classification.tsv`.
# Samples without a BLAST report are only reported; samples whose output file
# already exists are left untouched; samples without a qualimap report are
# skipped with a warning rather than aborting the whole batch.
nt_blast_report = "final.contigs.fastg.gfa.fna.blastn.nt.megablast.txt"
ProgressMeter.@showprogress for SRR_path in SRR_paths
    nt_blast_report_path = joinpath(SRR_path, "blastn", nt_blast_report)
    outfile = replace(nt_blast_report_path, r"\.txt$" => ".coverage-and-classification.tsv")
    qualimap_report_path = joinpath(SRR_path, "megahit", "qualimap", "genome_results.txt")

    if !isfile(nt_blast_report_path)
        @info "need to run blast $(basename(SRR_path))"
        # /oak/stanford/scg/lab_mpsnyder/cjprybol/Mycelia/projects/viral-exposome-discovery/data/SRA/SRR6399589/blastn/final.contigs.fastg.gfa.fna.blastn.nt.megablast.txt
    elseif isfile(outfile)
        # Output already written by an earlier run; nothing to do.
    elseif !isfile(qualimap_report_path)
        @warn "missing qualimap report, skipping $(basename(SRR_path))"
    else
        qualimap_coverage_table = parse_qualimap_contig_coverage(qualimap_report_path)
        blast_table = parse_blast_report(nt_blast_report_path)

        # Keep every contig from the coverage table; contigs without a BLAST hit
        # get `missing` in all BLAST columns.
        joined_table = DataFrames.leftjoin(qualimap_coverage_table, blast_table, on="Contig" => "query id")

        # Contigs without a usable taxon get tax id 0.
        if "subject tax id" in names(joined_table)
            joined_table[!, "subject tax id"] = coalesce.(joined_table[!, "subject tax id"], 0)
        else
            # Only the multi-valued column is available: take the first id of
            # the ';'-separated list.
            joined_table[!, "subject tax id"] = map(
                x -> (ismissing(x) || isempty(x)) ? 0 : parse(Int, first(split(x, ';'))),
                joined_table[!, "subject tax ids"])
        end
        # Any remaining missing cell is written out as an empty field.
        joined_table = ifelse.(ismissing.(joined_table), "", joined_table)

        uCSV.write(outfile, joined_table, delim='\t')
    end
end