In [None]:
import Pkg
# Packages that the cells of this notebook call by qualified name.
# ProgressMeter is listed because a later cell invokes `ProgressMeter.@showprogress`,
# which cannot be resolved unless the module has been imported.
pkgs = [
    "Revise",
    "DataFrames",
    "StatsBase",
    "uCSV",
    "ProgressMeter"
]
# Pkg.activate(; temp=true)
# Pkg.add(pkgs)
# `import` (rather than `using`) keeps every call site explicitly module-qualified.
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end
import Mycelia

In [None]:
# The "metadata" directory sits next to (not inside) the current working directory.
metadata_directory = joinpath(pwd() |> dirname, "metadata")

In [None]:
# Read the tab-delimited exposome metadata file (header on line 1) into a DataFrame.
# `typedetectrows=200` is passed through to uCSV's column type detection.
exposome_metadata = let exposome_tsv = joinpath(metadata_directory, "metadata_exposome.rds.tsv")
    parsed = uCSV.read(exposome_tsv; delim='\t', header=1, typedetectrows=200)
    DataFrames.DataFrame(parsed)
end

In [None]:
# Print each distinct value of the "aownership" column once, in sorted order.
foreach(println, sort(unique(exposome_metadata[!, "aownership"])))

In [None]:
# Host-annotation tables from three sources. All three files are read from
# `metadata_directory`, the same directory the exposome metadata is loaded from.

# NCBI host metadata; the literal strings "true"/"false" are decoded to Bool.
ncbi_metadata_file = joinpath(metadata_directory, "NCBI-virus-refseq.transformed.tsv")
ncbi_host_metadata = DataFrames.DataFrame(uCSV.read(ncbi_metadata_file, header=1, delim='\t', encodings=Dict("false" => false, "true" => true)))

# ICTV host metadata
ictv_metadata_file = joinpath(metadata_directory, "VMR_MSL38_v1 - VMR MSL38 v1.transformed.tsv")
ictv_host_metadata = DataFrames.DataFrame(uCSV.read(ictv_metadata_file, header=1, delim='\t', typedetectrows=100))
# Drop rows with an empty taxid, then convert the remaining taxid strings to integers.
ictv_host_metadata = ictv_host_metadata[.!isempty.(ictv_host_metadata[!, "taxid"]), :]
ictv_host_metadata[!, "taxid"] = parse.(Int, ictv_host_metadata[!, "taxid"])

# VirusHostDB metadata; "missing" is decoded to `missing`, "true"/"false" to Bool.
virushostdb_metadata_file = joinpath(metadata_directory, "virushostdb.transformed.tsv")
virushostdb_metadata = DataFrames.DataFrame(uCSV.read(virushostdb_metadata_file, header=1, delim='\t', typedetectrows=1086, encodings=Dict("missing" => missing, "false" => false, "true" => true)))

# Row masks are wrapped in `coalesce.(mask, false)`: a flag that was decoded as `missing`
# is treated as "not a match", because a row mask containing `missing` cannot be used
# for indexing. For columns that hold only Bool values this changes nothing.
vertebrate_taxids = union(
    ictv_host_metadata[ictv_host_metadata[!, "Host source"] .== "vertebrates", "taxid"],
    ncbi_host_metadata[coalesce.(ncbi_host_metadata[!, "host_is_vertebrate"], false), "taxid"],
    virushostdb_metadata[coalesce.(virushostdb_metadata[!, "host_is_vertebrate"], false), "virus_taxid"]
)

human_taxids = union(
    ncbi_host_metadata[coalesce.(ncbi_host_metadata[!, "host_is_human"], false), "taxid"],
    virushostdb_metadata[coalesce.(virushostdb_metadata[!, "host_is_human"], false), "virus_taxid"]
)

In [None]:
# Paths are resolved relative to the parent of the current working directory.
base_dir = pwd() |> dirname
data_dir = joinpath(base_dir, "data")
# Full paths (join=true) of every entry directly under <data_dir>/SRA; later cells
# treat each entry as one sample directory.
sample_directories = readdir(joinpath(data_dir, "SRA"); join=true)

In [None]:
# Collect, across all sample directories, the paths of the MMseqs easy-taxonomy LCA
# reports found in each sample's "mmseqs_easy_taxonomy" subdirectory.
easy_taxonomy_lca_reports = String[]
for sample_directory in sample_directories
    taxonomy_dir = joinpath(sample_directory, "mmseqs_easy_taxonomy")
    for candidate in readdir(taxonomy_dir; join=true)
        # Both markers are matched as substrings of the full path.
        has_prefix = occursin("final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy.", candidate)
        if has_prefix && occursin("_lca.tsv", candidate)
            push!(easy_taxonomy_lca_reports, candidate)
        end
    end
end
easy_taxonomy_lca_reports

In [None]:
# Pick the first collected report as a single example to work with.
lca_tsv = easy_taxonomy_lca_reports |> first

In [None]:
# Parse every LCA report and stack the results into one table, tagging each row with
# a "sample" and a "method" value derived from the report's path.
joint_lca_table = DataFrames.DataFrame()
ProgressMeter.@showprogress for lca_tsv in easy_taxonomy_lca_reports
    # Method label: the file name with the fixed assembly prefix removed, then with
    # the "_lca.tsv" marker removed.
    without_prefix = replace(basename(lca_tsv), "final.contigs.fastg.gfa.fna." => "")
    report_method = replace(without_prefix, "_lca.tsv" => "")
    # Sample label: the name of the directory two levels above the report file.
    sample_name = lca_tsv |> dirname |> dirname |> basename
    report_table = Mycelia.parse_mmseqs_easy_taxonomy_lca_tsv(lca_tsv)
    report_table[!, "sample"] .= sample_name
    report_table[!, "method"] .= report_method
    append!(joint_lca_table, report_table)
end
joint_lca_table

In [None]:
# Keep only the rows whose taxon_id is one of `human_taxids`.
# `human_taxids` is a Vector (built with `union`), so membership is tested against a Set
# built once instead of scanning the whole vector for every row. Set membership also
# always yields a Bool, so the result is a valid row mask.
filtered_lca_table = let human_taxid_set = Set(human_taxids)
    joint_lca_table[in.(joint_lca_table[!, "taxon_id"], Ref(human_taxid_set)), :]
end

In [None]:
# Tally how many rows carry each taxon name, then list the names from most to least
# frequent.
taxon_counts = let tallies = StatsBase.countmap(filtered_lca_table[!, "taxon_name"])
    DataFrames.DataFrame(
        taxon_name = collect(keys(tallies)),
        counts = collect(values(tallies)),
    )
end
sort!(taxon_counts, "counts", rev=true)

In [None]:
# Distinct taxon IDs remaining in the filtered LCA table.
mmseqs_taxid_hits = filtered_lca_table[!, "taxon_id"] |> unique

In [None]:
# Hard-coded list of taxon IDs to compare against `mmseqs_taxid_hits`.
# NOTE(review): presumably copied from a Kraken run; the provenance is not recorded here.
kraken_taxid_hits = [
    10566, 1175849, 2017081, 2025360, 2734294, 277944, 2955291, 2955746,
    3050300, 3052399, 36427, 59304, 642022, 93678, 11137, 11676,
    1239573, 1978540, 2304647, 290028, 3050298, 694009, 10243, 10245,
    1513260, 1513263, 1961678, 3052317, 3052458, 3052499, 568715, 129875,
    1678143, 1780507, 1891764, 1961681, 1987017, 291484, 1961680, 3048383,
    1904876, 3050294, 1513256, 2200830, 393049, 129951, 130308, 2055263,
    68887, 204269, 2049444, 2825847, 38170, 10244, 1962300, 944645,
    1972576, 3048384, 45617, 2170195, 3050295, 1513257, 2025396, 3047956,
    2844583, 1513258, 11983, 3050297, 3052413, 3050299, 10255, 2844646,
    2021738, 2170197, 2956268, 1518574, 3050296, 130310, 1647924, 99000,
    1511805, 10258, 3047957, 1891726, 38804, 108098, 1335626, 1922246,
    2844585,
];

In [None]:
# Print the distinct taxon names of rows whose taxon_id appears in both
# `kraken_taxid_hits` and `mmseqs_taxid_hits`.
# The shared-ID set is computed once up front; re-running `intersect` for every row of
# the table would repeat the same work each time.
let shared_taxids = Set(intersect(kraken_taxid_hits, mmseqs_taxid_hits))
    in_both = in.(filtered_lca_table[!, "taxon_id"], Ref(shared_taxids))
    for taxon_name in unique(filtered_lca_table[in_both, "taxon_name"])
        println(taxon_name)
    end
end

In [None]:
Human endogenous retrovirus K
Human papillomavirus KC5
Human papillomavirus
Human immunodeficiency virus 1
Alphapolyomavirus quintihominis