In [None]:
import Pkg
Pkg.activate(".")

# Registered packages this notebook relies on. All of them are added to the
# active project in one call and then imported by module name.
pkgs = [
    "Revise",
    "DataFrames",
    "ProgressMeter",
    "Statistics",
    "uCSV",
    "StatsBase",
    "StatsPlots",
    "FASTX",
    "Kmers",
]
Pkg.add(pkgs)
for pkg_name in pkgs
    # The names are only available as strings, so the `import` statement is
    # built as text, parsed, and evaluated.
    import_expr = Meta.parse("import $pkg_name")
    eval(import_expr)
end

import Mycelia

In [None]:
# Output directory: a `results` folder beside the current working directory.
# `mkpath` creates it when it does not exist and returns the path.
RESULTS_DIR = joinpath(dirname(pwd()), "results") |> mkpath

In [None]:
# Collect every taxonomy ID at or below Viruses (taxon 10239) using taxonkit.
#
# One-time setup of the taxonkit environment and its taxonomy dump:
#   mamba create -n taxonkit -c bioconda taxonkit
#   wget -c ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
#   tar -zxvf taxdump.tar.gz
#   mkdir -p $HOME/.taxonkit
#   cp names.dmp nodes.dmp delnodes.dmp merged.dmp $HOME/.taxonkit
# (see taxonkit's --data-dir option if the dump files live somewhere else)

viral_tax_ids = let taxonkit_cmd = `mamba run -n taxonkit taxonkit list --ids 10239 --indent ""`
    # One ID per output line; blank lines are discarded before parsing.
    id_lines = filter(!isempty, readlines(taxonkit_cmd))
    Set(map(line -> parse(Int, line), id_lines))
end

In [None]:
# Count of classification methods (per the name). In the per-sample loop it is
# the denominator of `viral_classification_percent`, and a contig is kept as
# viral when its support count is at least `n_methods / 2`.
n_methods = 8

In [None]:
# uniref_dbs = [
#     "UniRef100",
#     "UniRef90",
#     "UniRef50"
# ]
# Every entry under ../data/SRA is treated as one sample directory, except the
# Jupyter checkpoint folder, which is dropped from the listing.
data_dir = joinpath(dirname(pwd()), "data")
SRR_paths = [
    entry for entry in readdir(joinpath(data_dir, "SRA"), join=true) if
    !occursin(".ipynb_checkpoints", entry)
]

In [None]:
# Build one summary row per sample directory in `SRR_paths`.
#
# For each sample the loop:
#   1. reads every `*.contig_info.tsv` table and stacks them,
#   2. keeps the top hit per contig and method whose subject tax ID is viral,
#   3. keeps mmseqs easy-taxonomy LCA assignments whose taxon ID is viral,
#   4. loads the virsorter and genomad viral contig lists,
#   5. counts how many of these sources flag each contig,
#   6. treats contigs with support >= n_methods / 2 as viral and computes
#      contig, assembly-size, mapped-base and k-mer statistics.
#
# The table starts with no columns; the first `push!` of a named tuple
# establishes them.
sample_summary_table = DataFrames.DataFrame()

# Column types, in file order, for every `*.contig_info.tsv` table. They are the
# same for all files and samples, so the vector is built once.
contig_info_col_types = [
    Int64,
    Int64,
    Int64,
    Float64,
    Float64,
    Float64,
    String,
    String,
    String,
    Int64,
    String,
    Float64,
    Float64,
    Float64,
    Float64,
    Int64,
    Int64,
    Int64,
    Int64,
    Int64,
    Int64,
    Int64,
    Int64,
    Int64,
]

ProgressMeter.@showprogress for SRR_path in SRR_paths
    srr_id = basename(SRR_path)

    # --- contig info tables (one file per method) ---------------------------
    contig_info_pattern = srr_id * ".final.contigs.fastg.gfa.fna"
    contig_info_files = filter(x -> occursin(contig_info_pattern, x), readdir(SRR_path, join=true))
    joint_contig_info_table = DataFrames.DataFrame()
    for f in contig_info_files
        # The method name is what remains of the file name after removing the
        # sample/assembly prefix and the `.contig_info.tsv` suffix.
        method = replace(replace(basename(f), srr_id * ".final.contigs.fastg.gfa.fna." => ""), ".contig_info.tsv" => "")
        # Empty fields are read as `missing`.
        this_contig_info_table = DataFrames.DataFrame(uCSV.read(f, delim='\t', header=1, types=contig_info_col_types, encodings=Dict("" => missing), allowmissing=true)...)
        this_contig_info_table[!, "Method"] .= method
        this_contig_info_table[!, "SRR"] .= srr_id
        append!(joint_contig_info_table, this_contig_info_table)
    end
    sort!(joint_contig_info_table, "Contig")
    # Per-contig mapping statistics are repeated across methods; keep one copy.
    contig_info_table = unique(joint_contig_info_table[!, ["SRR", "Contig", "Length", "Mapped bases", "Mean coverage", "Standard Deviation", "% Mapped bases"]])

    # --- top viral hit per contig and method ---------------------------------
    # Rows that have a subject id, minus the per-contig statistics columns
    # (everything in `contig_info_table` except "Contig").
    blast_classifications_table = joint_contig_info_table[.!ismissing.(joint_contig_info_table[!, "subject id"]), DataFrames.Not(names(contig_info_table[!, DataFrames.Not("Contig")]))]
    sort!(blast_classifications_table, ["Contig", "evalue"])
    # After sorting by evalue, the first row of each (Contig, Method) group is
    # the lowest-evalue hit.
    blast_hits_top_hits_table = DataFrames.combine(DataFrames.groupby(blast_classifications_table, ["Contig", "Method"]), first)
    blast_hits_top_hits_table = blast_hits_top_hits_table[map(x -> x in viral_tax_ids, blast_hits_top_hits_table[!, "subject tax id"]), :]

    # --- mmseqs easy-taxonomy LCA assignments --------------------------------
    joint_lca_table = DataFrames.DataFrame()
    easy_taxonomy_lca_reports = filter(x -> occursin("final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy.", x) && occursin("_lca.tsv", x), readdir(joinpath(SRR_path, "mmseqs_easy_taxonomy"), join=true))
    for lca_tsv in easy_taxonomy_lca_reports
        method = replace(replace(basename(lca_tsv), "final.contigs.fastg.gfa.fna." => ""), "_lca.tsv" => "")
        this_lca_table = Mycelia.parse_mmseqs_easy_taxonomy_lca_tsv(lca_tsv)
        # Sample and method go in separate columns, matching the contig info
        # tables above.
        this_lca_table[!, "SRR"] .= srr_id
        this_lca_table[!, "Method"] .= method
        append!(joint_lca_table, this_lca_table)
    end
    sort!(joint_lca_table, "contig_id")
    joint_lca_table = joint_lca_table[map(x -> x in viral_tax_ids, joint_lca_table[!, "taxon_id"]), :]

    # --- virsorter ------------------------------------------------------------
    virsorter_score_tsv = joinpath(SRR_path, "virsorter", "final-viral-score.tsv")
    virsorter_results = Mycelia.parse_virsorter_score_tsv(virsorter_score_tsv)
    # Keep only the part of `seqname` before the first '|' and parse it as the
    # integer contig ID.
    virsorter_results[!, "seqname"] = parse.(Int, first.(split.(virsorter_results[!, "seqname"], '|')))

    # --- genomad --------------------------------------------------------------
    genomad_virus_summary = joinpath(SRR_path, "genomad", "final.contigs.fastg.gfa_summary", "final.contigs.fastg.gfa_virus_summary.tsv")
    genomad_results = DataFrames.DataFrame(uCSV.read(genomad_virus_summary, delim='\t', header=1, typedetectrows=100)...)
    # NOTE(review): `seq_name` is used with whatever type uCSV detects. If it
    # is read as String, these keys will not match the Int contig IDs from the
    # other sources — confirm against real genomad output.

    # --- per-contig support counts -------------------------------------------
    # One entry per (source, contig) that flagged the contig as viral.
    number_of_hits = vcat(
        blast_hits_top_hits_table[!, "Contig"],
        joint_lca_table[!, "contig_id"],
        virsorter_results[!, "seqname"],
        genomad_results[!, "seq_name"]
    )

    contig_support_counts = StatsBase.countmap(number_of_hits)

    # Contigs flagged by at least half of `n_methods`.
    majority_support_contigs = Set(keys(filter(x -> x[2] >= (n_methods/2), contig_support_counts)))
    fna_file = joinpath(SRR_path, "megahit", "final.contigs.fastg.gfa.fna")
    fna_records = collect(Mycelia.open_fastx(fna_file))
    viral_records = filter(x -> parse(Int, FASTX.identifier(x)) in majority_support_contigs, fna_records)

    number_of_viral_contigs = length(viral_records)
    # `init=0` keeps the total well-defined when a sample has no viral records.
    viral_assembly_size = mapreduce(record -> length(FASTX.sequence(record)), +, viral_records; init=0)

    # Number of distinct canonical k-mers in the viral records and in all records.
    viral_11mers = length(Mycelia.count_canonical_kmers(Kmers.DNAKmer{11}, viral_records))
    viral_13mers = length(Mycelia.count_canonical_kmers(Kmers.DNAKmer{13}, viral_records))
    viral_17mers = length(Mycelia.count_canonical_kmers(Kmers.DNAKmer{17}, viral_records))

    all_11mers = length(Mycelia.count_canonical_kmers(Kmers.DNAKmer{11}, fna_records))
    all_13mers = length(Mycelia.count_canonical_kmers(Kmers.DNAKmer{13}, fna_records))
    all_17mers = length(Mycelia.count_canonical_kmers(Kmers.DNAKmer{17}, fna_records))

    # Contigs that no source flagged get a count of 0.
    contig_info_table[!, "viral_classification_count"] = map(contig -> get(contig_support_counts, contig, 0), contig_info_table[!, "Contig"])

    contig_info_table[!, "viral_classification_percent"] = round.((contig_info_table[!, "viral_classification_count"] ./ n_methods) .* 100, digits=1)

    sample_summary_row = (
        sample_id = contig_info_table[1, "SRR"],
        total_mapped_bases = sum(contig_info_table[!, "Mapped bases"]),
        total_contigs = DataFrames.nrow(contig_info_table),
        total_assembly_size = sum(contig_info_table[!, "Length"]),
        percent_contigs_viral = round(count(contig_info_table[!, "viral_classification_percent"] .>= 50) / DataFrames.nrow(contig_info_table) * 100, digits=3),
        percent_mapped_bases_viral = sum(contig_info_table[contig_info_table[!, "viral_classification_percent"] .>= 50, "% Mapped bases"]),
        number_of_viral_contigs = number_of_viral_contigs,
        viral_assembly_size = viral_assembly_size,
        viral_11mers = viral_11mers,
        viral_13mers = viral_13mers,
        viral_17mers = viral_17mers,
        all_11mers = all_11mers,
        all_13mers = all_13mers,
        all_17mers = all_17mers
    )
    push!(sample_summary_table, sample_summary_row)
end
sample_summary_table

In [None]:
# Save the per-sample summary as a tab-separated file in the results directory.
summary_tsv_path = "$(RESULTS_DIR)/sample_viral_summary_stats_table.tsv"
uCSV.write(summary_tsv_path, sample_summary_table, delim='\t')

In [None]:
# Read the saved summary back (the first row is the header) and show it as a
# DataFrame to confirm the file round-trips.
saved_summary = uCSV.read("$(RESULTS_DIR)/sample_viral_summary_stats_table.tsv", header=1, delim='\t')
DataFrames.DataFrame(saved_summary...)

In [None]:
# contamination_load = sample_summary_table[findfirst(sample_summary_table[!, "sample_id"] .== "SRR6399584"), "total_mapped_bases"]

In [None]:
# mean_mapped_bases = Statistics.mean(sample_summary_table[!, "total_mapped_bases"])
# std_mapped_bases = Statistics.std(sample_summary_table[!, "total_mapped_bases"])

# p = StatsPlots.histogram(
#     sample_summary_table[!, "total_mapped_bases"],
#     ylabel = "# of samples",
#     xlabel = "# of mapped bases",
#     label=""
# )
# # p = StatsPlots.vline!(
# #     [mean_mapped_bases - 2.5std_mapped_bases],
# #     label="-2.5 standard deviations"
# # )

# p = StatsPlots.vline!(
#         [sample_summary_table[findfirst(sample_summary_table[!, "sample_id"] .== "SRR6399584"), "total_mapped_bases"]],
#         label="negative control"
#     )

In [None]:
# first list

# 1	SRR6399584	chaoexpo26_I6
# 2	SRR6399726	chaoexpo62_I6
# 3	SRR6399596	chaoexpo36_I15
# 4	SRR7365476	chaoexpo27_I5
# 5	SRR6399525	chaoexpo67_I12
# 6	SRR6399862	chaoexpo17_I15
# 7	SRR6399652	xinexpo76_I5

In [None]:
# sort!(sample_summary_table, "percent_mapped_bases_viral", rev=true)[1:10, :]

In [None]:
# a = sort(sample_summary_table, "total_mapped_bases_viral", rev=true)[1:10, "sample_id"]
# b = sort(sample_summary_table, "percent_mapped_bases_viral", rev=true)[1:10, "sample_id"]
# ab = union(a, b)
# # for x in union(a, b)
# #     println(x)
# # end

In [None]:
# sample_summary_table[, "total_mapped_bases"] .>= contamination_load
# sample_summary_table[



In [None]:
# metadata_file = joinpath(dirname(pwd()), "metadata", "exposome", "SraRunInfo.csv")
# metadata_table = DataFrames.DataFrame(uCSV.read(metadata_file, header=1)...)

In [None]:
# for x in metadata_table[map(x -> x in ab, metadata_table[!, "Run"]), "LibraryName"]
#     println(x)
# end

In [None]:
# is_in_top_set = map(x -> x in ab, sample_summary_table[!, "sample_id"])
# is_above_contamination_load = sample_summary_table[!, "total_mapped_bases"] .>= contamination_load
# above_contamination = sample_summary_table[is_in_top_set .& is_above_contamination_load, "sample_id"]

In [None]:
# for x in metadata_table[map(x -> x in above_contamination, metadata_table[!, "Run"]), "LibraryName"]
#     println(x)
# end

In [None]:


# joint_summary_table = DataFrames.innerjoin(summary_table, metadata_table, on="sample_identifier" => "Run")

In [None]:
# intersect list above with library prep dataframe

In [None]:
# sample_summary_table[!, "total_mapped_bases_viral"] = sample_summary_table[!, "total_mapped_bases"] .* sample_summary_table[!, "percent_mapped_bases_viral"]

In [None]:
# findfirst(sample_summary_table[!, "sample_id"] .== "SRR6399584") / DataFrames.nrow(sample_summary_table)

In [None]:
# negative_control = sample_summary_table[findfirst(sample_summary_table[!, "sample_id"] .== "SRR6399584"), "total_mapped_bases"]

In [None]:
# negative_control = 6080239502

In [None]:
# sample_summary_table[sample_summary_table[!, "total_mapped_bases"] .<= (mean_mapped_bases - 2std_mapped_bases), :]

In [None]:
# sample_summary_table[sample_summary_table[!, "total_mapped_bases"] .>= (mean_mapped_bases - 2.5std_mapped_bases), :]

In [None]:
# sort(sample_summary_table, "percent_contigs_viral", rev=true)

In [None]:
# DataFrames.groupby(sort!(joint_contig_info_table[.!isempty.(joint_contig_info_table[!, "subject id"]), :], "evalue"), "Method")

In [None]:
# DataFrames.sort!(DataFrames.groupby(joint_contig_info_table, "Method"), "")

In [None]:
# take genomad and virsorter results as binary

In [None]:
# make 

In [None]:
# tophit_report = "/global/cfs/cdirs/m4269/cjprybol/Mycelia/projects/viral-pangenome-discovery/data/SRA/SRR6399459/mmseqs_easy_taxonomy/final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy.UniRef100_tophit_report"
# Mycelia.parse_mmseqs_easy_taxonomy_tophit_report(tophit_report)

In [None]:
# tophit_aln = "/global/cfs/cdirs/m4269/cjprybol/Mycelia/projects/viral-pangenome-discovery/data/SRA/SRR6399459/mmseqs_easy_taxonomy/final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy.UniRef100_tophit_aln"
# Mycelia.parse_mmseqs_tophit_aln(tophit_aln)

In [None]:
# report - this is tree-based relative frequencies - make krona plots from these!

In [None]:
# load in other classification approaches

In [None]:
# get number of contigs from each sample

In [None]:
# viral_frequency_table = DataFrames.DataFrame(
#     sample_identifier = String[],
#     classification_method = String[],
#     percent_viral_contigs = Float64[]
# )

# ProgressMeter.@showprogress for SRR_path in SRR_paths
#     mmseqs_dir = joinpath(SRR_path, "mmseqs_easy_taxonomy")
#     for db in uniref_dbs
#         kraken_report = "$(mmseqs_dir)/final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy.$(db)_report"
#         # add something to determine everything below viruses with taxonkit (`taxonkit list --ids 10239`)
#         viral_lines = collect(Iterators.filter(x -> occursin(r"virus"i, x) && occursin(r"10239"i, x), eachline(kraken_report)))
#         if length(viral_lines) == 0
#             percent_viral_contigs = 0.0
#         elseif length(viral_lines) == 1
#             viral_line_table = Mycelia.read_kraken_report(IOBuffer(join(viral_lines, '\n')))
#             percent_viral_contigs = viral_line_table[1, "percentage_of_fragments_at_or_below_taxon"]
#         else
#             display(viral_lines)
#             error()
#         end
#         row = (
#             sample_identifier = basename(SRR_path),
#             classification_method = db,
#             percent_viral_contigs = percent_viral_contigs
#         )
#         push!(viral_frequency_table, row)
#     end
# end
# viral_frequency_table

In [None]:
# readdir()

In [None]:
# summary_table = DataFrames.combine(DataFrames.groupby(viral_frequency_table, "sample_identifier"), "percent_viral_contigs" => Statistics.median)
# metadata_file = joinpath(dirname(pwd()), "metadata", "exposome", "SraRunInfo.csv")
# metadata_table = DataFrames.DataFrame(uCSV.read(metadata_file, header=1)...)
# joint_summary_table = DataFrames.innerjoin(summary_table, metadata_table, on="sample_identifier" => "Run")
# joint_summary_table = sort(joint_summary_file, "percent_viral_contigs_median", rev=true)

In [None]:
# joint_summary_table[joint_summary_table[!, "percent_viral_contigs_median"] .>= 1, ["sample_identifier", "LibraryName"]]

In [None]:
# SRR_path = first(SRR_paths)

In [None]:
# make joint_contig_info reports

In [None]:
# readdir(SRR_path)

In [None]:
# # "genomad"
# readdir(joinpath(SRR_path, "genomad", "final.contigs.fastg.gfa_summary"))
# # "final.contigs.fastg.gfa_virus_genes.tsv"
# # "final.contigs.fastg.gfa_virus_proteins.faa"

# # "final.contigs.fastg.gfa_virus.fna"
# # "final.contigs.fastg.gfa_virus_summary.tsv"

In [None]:
# hit vs no hit
# "blastn"

In [None]:
# "virsorter"
# # final-viral-score.tsv - 1 line per viral contig
# # final-viral-combined.fa - viral fasta records

In [None]:
# readdir(SRR_path)

In [None]:
# summary_table = DataFrames.combine(DataFrames.groupby(viral_frequency_table, "sample_identifier"), "percent_viral_contigs" => Statistics.median)

In [None]:

# for SRR_path in SRR_paths
#     SRR = basename(SRR_path)
#     assembled_fasta = joinpath(SRR_path, "megahit", "final.contigs.fastg.gfa.fna")
#     out_dir = mkpath(joinpath(SRR_path, "mmseqs_easy_taxonomy"))
#     outfile_base = joinpath(out_dir, basename(assembled_fasta) * ".mmseqs_easy_taxonomy." * basename(target_database))
#     outfiles = [outfile_base * ext for ext in ["_lca.tsv", "_report", "_tophit_aln", "_tophit_report"]]
#     if !all(isfile, outfiles)
#         println("need to run $target_database for $SRR")
#         # run(`sbatch mmseqs.sh $(assembled_fasta) $(target_database) $(outfile_base) $(joinpath(out_dir, "tmp"))`)
#         # println(`sbatch mmseqs.sh $(assembled_fasta) $(target_database) $(outfile_base) $(joinpath(out_dir, "tmp"))`)
#         # 73 minutes UniRef100
#         # 35 minutes UniRef90
#         # 10 minutes UniRef50
#         # println("conda run --no-capture-output -n mmseqs2 mmseqs easy-taxonomy $(assembled_fasta) $(target_database) $(outfile_base) $(joinpath(out_dir, "tmp"))")
#         # Mycelia.run_mmseqs_easy_taxonomy(out_dir=SRR_path, query_fasta=assembled_fasta, target_database=target_database, outfile=SRR)
#     else
#         println("all outfiles already exists for $target_database and $SRR...")
#     end
# end

In [None]:
# python $HOME/workspace/Mycelia/projects/viral-pangenome-discovery/notebooks/kreport2krona.py -r $HOME/workspace/Mycelia/projects/viral-pangenome-discovery/data/SRA/SRR6399459/mmseqs_easy_taxonomy_default/SRR6399459.mmseqs_easy_taxonomy.UniRef100.txt_report -o $HOME/workspace/Mycelia/projects/viral-pangenome-discovery/data/SRA/SRR6399459/mmseqs_easy_taxonomy_default/SRR6399459.mmseqs_easy_taxonomy.UniRef100.txt_report.krona
# mamba install -c bioconda krona
# ktUpdateTaxonomy.sh
# ktImportText $HOME/workspace/Mycelia/projects/viral-pangenome-discovery/data/SRA/SRR6399459/mmseqs_easy_taxonomy_default/SRR6399459.mmseqs_easy_taxonomy.UniRef100.txt_report.krona -o $HOME/workspace/Mycelia/projects/viral-pangenome-discovery/data/SRA/SRR6399459/mmseqs_easy_taxonomy_default/SRR6399459.mmseqs_easy_taxonomy.UniRef100.txt_report.krona.html
# if !isfile(krona_file)
#     run(`python kreport2krona.py -r $(report) -o $(krona_file)`)
# end
# if !isfile(krona_html)
#     run(`ktImportText $(krona_file) -o $(krona_html)`)
# end