In [None]:
import Pkg

# Work inside the project environment located in the current directory.
Pkg.activate(".")

# Packages this notebook depends on; they are added to the environment first
# and then imported one by one below.
pkgs = ["DataFrames", "uCSV", "ProgressMeter"]
Pkg.add(pkgs)

# `import` (not `using`) is generated for each package, so their functions are
# only reachable through the module name (e.g. `DataFrames.DataFrame`).
for pkg in pkgs
    import_statement = Meta.parse("import $pkg")
    eval(import_statement)
end

In [None]:
"""
    parse_blast_report(blast_report)

Read a tab-delimited BLAST report that carries `#`-prefixed comment lines into a
`DataFrames.DataFrame`.

Column names come from the first comment line containing `# Fields:`. Each
column is parsed with the type registered for its field name; a field name that
is not registered is read as `String` (with a warning) instead of aborting the
whole report. Cells equal to `"N/A"` are read as `missing`.

# Arguments
- `blast_report`: path of the report file. The file is opened twice: once to
  find the header line and once by `uCSV.read` for the data rows.

# Returns
A `DataFrames.DataFrame` with one column per listed field (duplicate field
names are made unique), or an empty `DataFrames.DataFrame()` when the report
has no `# Fields:` line.
"""
function parse_blast_report(blast_report)
    # example header line
    # "# Fields: query id, subject id, subject acc., subject acc.ver, subject title, query length, subject length, q. start, q. end, s. start, s. end, evalue, bit score, score, alignment length, % identity, identical, mismatches, subject tax id"
    # Stop at the first header line instead of scanning the whole report, and
    # use `open ... do` so the handle is closed even though we return early.
    header_line = open(blast_report) do io
        for line in eachline(io)
            occursin(r"# Fields:", line) && return line
        end
        return nothing
    end
    if header_line === nothing
        @info "no hits found, returning empty table"
        return DataFrames.DataFrame()
    end
    header = split(last(split(header_line, ": ")), ", ")
    # Parse type for every known field name (each key listed once).
    blast_col_types = Dict{String,DataType}(
        "query id" => String,
        "query title" => String,
        "subject id" => String,
        "subject gi" => String,
        "subject acc." => String,
        "subject acc.ver" => String,
        "subject title" => String,
        "subject titles" => String,
        "query length" => Int,
        "subject length" => Int,
        "q. start" => Int,
        "q. end" => Int,
        "s. start" => Int,
        "s. end" => Int,
        "evalue" => Float64,
        "bit score" => Float64,
        "score" => Float64,
        "alignment length" => Int,
        "% identity" => Float64,
        "identical" => Int,
        "mismatches" => Int,
        "subject tax id" => Int,
        "subject tax ids" => String,
        "subject sci name" => String,
        "subject sci names" => String,
        "subject com names" => String,
        "subject blast name" => String,
        "subject blast names" => String,
        "subject super kingdom" => String,
        "subject super kingdoms" => String,
    )
    # Unknown fields fall back to String rather than raising a KeyError, so a
    # report with an extra output column can still be loaded.
    col_types = map(header) do field
        get(blast_col_types, field) do
            @warn "unrecognized BLAST field, reading it as String" field
            String
        end
    end
    data, _ = uCSV.read(
        blast_report,
        delim='\t',
        comment='#',
        # skipmalformed=true,
        allowmissing=true,
        encodings=Dict("N/A" => missing),
        types=col_types)
    return DataFrames.DataFrame(data, header, makeunique=true)
end

In [None]:
# Taxon IDs printed by `taxonkit list` for ID 10239 (presumably the viral root,
# given the variable name), one per line with no indentation; blank lines are
# skipped before parsing.
viral_tax_ids = let
    listing = readlines(`conda run -n taxonkit taxonkit list --ids 10239 --indent ""`)
    Set{Int}(parse(Int, line) for line in listing if !isempty(line))
end

In [None]:
# Root directory of the shared data; the batch directory is resolved under it.
basedir = "/labs/mpsnyder/share/exposome_data"

In [None]:
# List the entries directly under `basedir` (displayed as the cell's output).
readdir(basedir)

In [None]:
# Directory that is scanned for the BLAST report files of the batch to analyze.
batch_directory = joinpath(basedir, "Expo_pliot1_extracted_hifi_fastqs")
# Alternative batch, kept so it can be switched in by hand:
# batch_directory = joinpath(basedir, "extracted1")

In [None]:
# megablast_results = filter(x -> occursin(".megablast.nt.txt", x), readdir(batch_directory, join=true))
# Full paths of every entry in the batch directory whose name contains
# ".blastn.nt.txt" (the variable keeps its earlier "megablast" name).
megablast_results = filter(readdir(batch_directory, join=true)) do path
    occursin(".blastn.nt.txt", path)
end

In [None]:
# 5 has crass phage
# Citrus necrotic spot virus strain Jal-1
# Faecal-associated gemycircularvirus 4
# megablast_result = megablast_results[8]

In [None]:
# Accumulate, across all report files, the hits whose taxon ID is in
# `viral_tax_ids`.
joint_blast_results = DataFrames.DataFrame()
ProgressMeter.@showprogress for megablast_result in megablast_results
    blast_table = parse_blast_report(megablast_result)
    if DataFrames.nrow(blast_table) > 0
        if "subject tax id" in names(blast_table)
            # Single-ID column: a missing ID becomes 0.
            tax_ids = coalesce.(blast_table[!, "subject tax id"], 0)
        else
            # Multi-ID column: take the first ';'-separated ID; missing or
            # empty entries become 0.
            tax_ids = map(blast_table[!, "subject tax ids"]) do ids
                if ismissing(ids) || isempty(ids)
                    0
                else
                    parse(Int, first(split(ids, ';')))
                end
            end
        end
        blast_table[!, "subject tax id"] = tax_ids
        # Keep only the rows whose taxon ID is a known viral ID.
        is_viral = [tax_id in viral_tax_ids for tax_id in tax_ids]
        blast_table = blast_table[is_viral, :]
        append!(joint_blast_results, blast_table)
    end
end

In [None]:
# Distinct (subject title, subject tax id) pairs among the collected hits.
unique(DataFrames.select(joint_blast_results, ["subject title", "subject tax id"], copycols=false))

In [None]:
# joint_blast_results[map(x -> !(x in [1391026, 1354727, 1391033]), joint_blast_results[!, "subject tax id"]), :]

In [None]:
# joint_blast_results

In [None]:
# Drop hits assigned to the two listed taxon IDs.
# The rows are taken from `joint_blast_results`: `blast_table` is only ever
# bound inside the accumulation loop, so it is undefined at notebook scope when
# the cells are run top to bottom.
blast_table = joint_blast_results[map(tax_id -> !(tax_id in (1391026, 1354727)), joint_blast_results[!, "subject tax id"]), :]

In [None]:
# function parse_blast_report(blast_report, top_n = 1)
#     top_n_contig_hits = Dict{String, Vector{String}}()
#     # example header line 
#     # "# Fields: query id, subject id, subject acc., subject acc.ver, subject title, query length, subject length, q. start, q. end, s. start, s. end, evalue, bit score, score, alignment length, % identity, identical, mismatches, subject tax id"
#     header_line = first(Iterators.filter(x -> occursin(r"# Fields:", x), eachline(blast_report)))
#     header = split(last(split(header_line, ": ")), ", ")
#     blast_col_types = Dict(
#         "query id" => String,
#         "query title" => String,
#         "subject id" => String,
#         "subject gi" => String,
#         "subject acc." => String,
#         "subject acc.ver" => String,
#         "subject title" => String,
#         "query length" => Int,
#         "subject length" => Int,
#         "q. start" => Int,
#         "q. end" => Int,
#         "s. start" => Int,
#         "s. end" => Int,
#         "evalue" => Float64,
#         "bit score" => Float64,
#         "score" => Float64,
#         "alignment length" => Int,
#         "% identity" => Float64,
#         "identical" => Int,
#         "mismatches" => Int,
#         "subject tax id" => Int,
#         "subject sci name" => String,
#         "subject com names" => String,
#         "subject blast name" => String,
#         "subject super kingdom" => String,
#         "subject tax ids" => String,
#         "subject sci names" => String,
#         "subject com names" => String,
#         "subject blast names" => String,
#         "subject super kingdoms" => String,
#         "subject title" => String,
#         "subject titles" => String
#     )
#     for line in Iterators.filter(x -> !occursin(r"^#", x), eachline(blast_report))
#         contig = first(split(line, '\t'))
#         if !haskey(top_n_contig_hits, contig)
#             top_n_contig_hits[contig] = [line]
#         elseif length(top_n_contig_hits[contig]) < top_n
#             push!(top_n_contig_hits[contig], line)
#         end
#     end
#     reconstructed_file = join([join(value, '\n') for value in values(top_n_contig_hits)], '\n')
#     data, _ = uCSV.read(
#         IOBuffer(reconstructed_file),
#         delim='\t',
#         # comment='#',
#         # skipmalformed=true,
#         allowmissing=true,
#         encodings=Dict("N/A" => missing),
#         types=[blast_col_types[h] for h in header])
#     return DataFrames.DataFrame(data, header, makeunique=true)
# end

In [None]:
# ProgressMeter.@showprogress for SRR_path in SRR_paths
#     # SRR_path = first(SRR_paths)
#     qualimap_coverage_table = parse_qualimap_contig_coverage(joinpath(SRR_path, "megahit", "qualimap", "genome_results.txt"))
#     mmseqs_lca_files = filter(x -> occursin("_lca.tsv", x) && occursin("final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy", x), readdir(joinpath(SRR_path, "mmseqs_easy_taxonomy"), join=true))

#     # mmseqs_lca_file = first(mmseqs_lca_files)
#     for mmseqs_lca_file in mmseqs_lca_files

#         parse_mmseqs_easy_taxonomy_lca_tsv(mmseqs_lca_file)
#         lca_table = parse_mmseqs_easy_taxonomy_lca_tsv(mmseqs_lca_file)
#         contig_coverage_and_classification_table = DataFrames.leftjoin(qualimap_coverage_table, lca_table, on="Contig" => "contig_id")
#         contig_coverage_and_classification_table[!, "taxon_id"] = map(x -> ismissing(x) ? 0 : x, contig_coverage_and_classification_table[!, "taxon_id"])
#         contig_coverage_and_classification_table = ifelse.(ismissing.(contig_coverage_and_classification_table), "", contig_coverage_and_classification_table)

#         uCSV.write(
#             replace(mmseqs_lca_file, r"\.tsv$" => ".coverage-and-classification.tsv"),
#             contig_coverage_and_classification_table,
#             delim='\t'
#         )
#     end
# end