In [None]:
# run(`conda create --channel conda-forge --channel bioconda --channel defaults --strict-channel-priority --name blast blast`)
# run(`conda create --channel conda-forge --channel bioconda --channel defaults --strict-channel-priority --name taxonkit taxonkit`)

In [None]:
import Pkg

# Third-party packages used by this notebook. They are imported in list order,
# so "Revise" is imported before the others.
pkgs = [
    "Revise",
    "DataFrames",
    "uCSV",
    "StatsPlots",
    "StatsBase",
    "FreqTables",
    "Conda",
    "ProgressMeter",
    "PrettyTables"
]
# Pkg.add(pkgs)

# Import every package by name. The import expression is built from a Symbol
# instead of parsing an interpolated string, so no source text is evaluated.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end
import Mycelia

In [None]:
# Data root: the "data" directory under the parent of the current working directory.
data_dir = joinpath(dirname(pwd()), "data")

In [None]:
# BLAST task label; it is interpolated into the per-sample BLAST report file name.
blast_task = "megablast"

In [None]:
# BLAST database label; it is interpolated into the per-sample BLAST report file name.
db = "nt_viruses"

In [None]:
# Load the sample metadata tables, join them, and derive the per-run sample directories.
metadata_dir = joinpath(dirname(pwd()), "metadata")

exposome_environmental_data = DataFrames.DataFrame(uCSV.read(
    joinpath(metadata_dir, "metadata_exposome.rds.tsv"),
    delim='\t',
    header=1,
    typedetectrows=300
))

# Path components are passed separately so joinpath supplies the separator.
joint_sample_metadata = DataFrames.DataFrame(uCSV.read(
    joinpath(metadata_dir, "exposome", "joint_sample_metadata.tsv"),
    delim='\t',
    header=1,
    typedetectrows=300
))

# The two library-name columns are expected to be identical. This is a check on
# loaded data, so it is an explicit test rather than @assert (assertions may be
# disabled at some optimization levels).
if !(joint_sample_metadata[!, "Library Name"] == joint_sample_metadata[!, "LibraryName"])
    error("joint_sample_metadata: columns \"Library Name\" and \"LibraryName\" differ")
end

# Keep only the samples present in both tables.
joint_metadata = DataFrames.innerjoin(
    joint_sample_metadata,
    exposome_environmental_data,
    on="Library Name" => "samplenames")

run_ids = sort(joint_metadata[!, "Run"])

# One directory per run under <data_dir>/SRA.
sample_paths = joinpath.(data_dir, "SRA", run_ids)

In [None]:
# Host annotation from the NCBI virus RefSeq table; the literal strings
# "false"/"true" are decoded to Bool values while reading.
ncbi_metadata_file = joinpath(dirname(pwd()), "metadata", "NCBI-virus-refseq.transformed.tsv")
ncbi_host_metadata = DataFrames.DataFrame(
    uCSV.read(
        ncbi_metadata_file;
        delim='\t',
        header=1,
        encodings=Dict("false" => false, "true" => true),
    ),
)

# Host annotation from the ICTV VMR table.
ictv_metadata_file = joinpath(dirname(pwd()), "metadata", "VMR_MSL38_v1 - VMR MSL38 v1.transformed.tsv")
ictv_host_metadata = DataFrames.DataFrame(
    uCSV.read(
        ictv_metadata_file;
        delim='\t',
        header=1,
        typedetectrows=100,
    ),
)
# Discard rows whose taxid field is empty, then turn the remaining taxids into integers.
ictv_host_metadata = filter("taxid" => !isempty, ictv_host_metadata)
ictv_host_metadata[!, "taxid"] = [parse(Int, raw_taxid) for raw_taxid in ictv_host_metadata[!, "taxid"]]

# Set of every taxon id returned by list_subtaxa for root taxon 10239
# (used later to keep viral hits only).
viral_tax_ids = Set(Mycelia.list_subtaxa(10239))

In [None]:
# Collect, across all samples, the highest-bit-score BLAST hit of every query.
joint_top_hits = DataFrames.DataFrame()
ProgressMeter.@showprogress for sample_path in sample_paths
    sample = basename(sample_path)
    # mkpath returns the directory path (and creates the directory if it is absent).
    blastn_directory = mkpath(joinpath(sample_path, "blastn"))
    assembled_fasta = joinpath(sample_path, "megahit", "final.contigs.fastg.gfa.fna")
    blast_file = joinpath(blastn_directory, basename(assembled_fasta) * ".blastn.$(db).$(blast_task).txt")
    this_blast_table = Mycelia.parse_blast_report(blast_file)

    # Samples without any hits contribute nothing.
    isempty(this_blast_table) && continue

    this_blast_table[!, "sample_id"] .= sample
    # Bonferroni correction on raw tests: scale each e-value by the number of
    # hits in this sample's table.
    this_blast_table[!, "evalue"] = this_blast_table[!, "evalue"] .* DataFrames.nrow(this_blast_table)

    # filter to top hits to avoid ballooning memory just to throw it away later
    this_top_hits = DataFrames.DataFrame()
    for gdf in DataFrames.groupby(this_blast_table, "query id")
        # argmax selects the first row holding the highest bit score, which
        # avoids copying and sorting the whole group to read a single row.
        push!(this_top_hits, gdf[argmax(gdf[!, "bit score"]), :])
    end
    append!(joint_top_hits, this_top_hits)
end

In [None]:
# Attach a taxon name to every top hit.
taxids = unique(joint_top_hits[!, "subject tax id"])
taxid2name_map = Dict(
    row["taxid"] => row["tax_name"]
    for row in DataFrames.eachrow(Mycelia.taxids2lineage_name_and_rank(taxids))
)
joint_top_hits[!, "subject tax name"] = getindex.(Ref(taxid2name_map), joint_top_hits[!, "subject tax id"])

# Keep hits that remain significant after the Bonferroni correction.
joint_top_hits = filter("evalue" => <=(0.001), joint_top_hits)

# Keep hits whose subject taxon is in the viral taxon set.
viral_hits_df = joint_top_hits[in.(joint_top_hits[!, "subject tax id"], Ref(viral_tax_ids)), :]

# Host category to report on; the alternatives are other flag columns of the NCBI table.
# current_host = "host_is_vertebrate"
# current_host = "host_is_mammal"
# current_host = "host_is_primate"
current_host = "host_is_human"
host_viral_tax_ids = Set(
    ncbi_host_metadata[ncbi_host_metadata[!, current_host] .== true, "taxid"]
)

# host_viral_taxids = ictv_host_metadata[map(x -> x in ["vertebrates", "invertebrates, vertebrates"], ictv_host_metadata[!, "Host source"]), "taxid"]
# Keep viral hits whose taxon is flagged for the selected host category.
host_hits_df = viral_hits_df[in.(viral_hits_df[!, "subject tax id"], Ref(host_viral_tax_ids)), :]

In [None]:
# Cross-tabulate the host-associated viral hits by "sample_id" and "subject tax name".
ft = FreqTables.freqtable(host_hits_df, "sample_id", "subject tax name")

In [None]:
# Turn the frequency table into a DataFrame: the first column holds the labels of
# dimension 1, followed by one count column per label of dimension 2.
data = vcat([collect(keys(ft.dicts[1]))], collect(eachcol(ft.array)))
header = vcat(
    string(ft.dimnames[1], " \\ ", ft.dimnames[2]),
    collect(keys(ft.dicts[2])),
)
summary_table = DataFrames.DataFrame(data, header)

# summary_table

In [None]:
# Add selected metadata columns to the summary table by matching its label
# column against the "Run" column of the joint metadata.
summary_table = DataFrames.innerjoin(
    summary_table,
    DataFrames.select(joint_metadata, ["Run", "aownership", "geo_loc_name", "date.end"]);
    on = "sample_id \\ subject tax name" => "Run",
)

In [None]:
# Write the summary table as CSV under <data_dir>/results.
results_dir = joinpath(data_dir, "results")
# readdir(results_dir)

# Make sure the output directory exists before writing; mkpath does nothing
# when the directory is already there.
mkpath(results_dir)
uCSV.write(joinpath(results_dir, "blast_hits_summary_table.csv"), summary_table)

In [None]:
# Evaluate results_dir as the cell's value so the output location is displayed.
results_dir

In [None]:
# m = "text/plain"
# m = "text/html"
# m =  "text/latex"
# m = "text/csv"
# m = "text/tab-separated-values"

# show(stdout, MIME(m), summary_table)

In [None]:
# Print the summary table with the text backend; the commented lines are the
# other backends that were tried.
# show(stdout, MIME("text/html"), )
# PrettyTables.pretty_table(summary_table, backend = Val(:markdown))
# PrettyTables.pretty_table(summary_table, backend = Val(:latex))
# PrettyTables.pretty_table(summary_table, backend = Val(:html))
PrettyTables.pretty_table(summary_table, backend = Val(:text))

In [None]:
# Print `matrix` to stdout using its text/plain representation.
# NOTE(review): `matrix` is not assigned in any cell visible here — confirm it is
# defined elsewhere, otherwise this cell raises UndefVarError.
show(stdout, "text/plain", matrix)