In [None]:
# conda create -n taxonkit -c bioconda taxonkit
# mkdir $HOME/.taxonkit
# cd $HOME/.taxonkit
# wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
# tar -xvzf taxdump.tar.gz

In [None]:
# DataFrames display limits (set through environment variables).
# Column limit, default = 100 (left at its default):
# ENV["DATAFRAMES_COLUMNS"] = 100
# Row limit, default = 25 (reduced here to keep notebook output short):
ENV["DATAFRAMES_ROWS"] = 3

import Pkg

# Packages used by this notebook; each one is installed and then imported by name.
pkgs = [
    "ProgressMeter",
    "uCSV",
    "DataFrames",
    "StatsBase",
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the `import` expression from a Symbol instead of parsing a string of code.
    @eval import $(Symbol(pkg))
end

In [None]:
# Parent of the current working directory; used below as the root for the metadata folder.
# NOTE(review): presumably the notebook is launched from a subdirectory of the project — confirm.
base_directory = dirname(pwd())

In [None]:
# Folder holding the input metadata tables; the transformed tables are written back into it.
metadata_directory = joinpath(base_directory, "metadata")

In [None]:
# Show the names of the entries currently present in the metadata folder.
readdir(metadata_directory)

In [None]:
"""
    list_subtaxa_ids(id)

Run `taxonkit list --ids id` inside the `taxonkit` conda environment and return the
integers found on its non-blank output lines as a `Set`.

Each output line is stripped of surrounding whitespace before parsing, so indented
output is accepted. The command is expected to report `id` together with the taxids
below it (behaviour of `taxonkit list` — confirm against the taxonkit documentation).
"""
function list_subtaxa_ids(id)
    command = `conda run --live-stream -n taxonkit taxonkit list --ids $(id)`
    taxids = Int[]
    for line in readlines(command)
        field = strip(line)
        # Blank lines carry no taxid.
        isempty(field) && continue
        push!(taxids, parse(Int, field))
    end
    return Set(taxids)
end

# Build the taxid sets that the host_is_* membership flags are computed against in the
# cells below. Each numeric argument is the root taxid handed to `taxonkit list`; the
# variable names give the intended clade (presumably NCBI Taxonomy IDs — confirm).
# `@time` reports how long each external taxonkit call takes.
@time vertebrate_taxids = list_subtaxa_ids(7742)
@time mammal_taxids = list_subtaxa_ids(40674)
@time primate_taxids = list_subtaxa_ids(9443)
@time human_taxids = list_subtaxa_ids(9606)

ICTV is the highest-level source; it is good for getting a top-level picture of the virome.

In [None]:
# Load the ICTV VMR (MSL38 v1) table. Columns referenced in this cell: "Species",
# and, in the disabled filters below, "Virus REFSEQ accession",
# "Exemplar or additional isolate" and "Host source".
ictv_vmr_metadata = DataFrames.DataFrame(
    uCSV.read(joinpath(metadata_directory, "VMR_MSL38_v1 - VMR MSL38 v1.tsv"), delim='\t', header=1)
)

# Optional row filters, currently disabled:
# ictv_vmr_metadata = ictv_vmr_metadata[.!isempty.(ictv_vmr_metadata[!, "Virus REFSEQ accession"]), :]
# ictv_vmr_metadata = ictv_vmr_metadata[ictv_vmr_metadata[!, "Exemplar or additional isolate"] .== "E", :]

# Write one species name per line; this file is the input of `taxonkit name2taxid`.
open("species-names.txt", "w") do io
    for x in ictv_vmr_metadata[!, "Species"]
        println(io, x)
    end
end

# Read the lookup result inside `open(...) do ... end` so that the process chain is
# waited on and closed, and a failing `cat`/`conda`/`taxonkit` raises an error instead
# of silently producing a truncated or empty mapping.
# NOTE(review): typedetectrows=1962 is a hard-coded value — confirm it suits this input.
data, header = open(pipeline(`cat species-names.txt`, `conda run --live-stream -n taxonkit taxonkit name2taxid`)) do stream
    uCSV.read(stream, delim='\t', typedetectrows=1962)
end

# The lookup output is read as two columns: the queried name and the reported taxid.
name_taxid_map = DataFrames.DataFrame(data, ["Species", "taxid"])
name_taxid_map = unique(name_taxid_map)

# Inner join: only rows whose "Species" value appears in the lookup output are kept.
ictv_vmr_metadata = DataFrames.innerjoin(ictv_vmr_metadata, name_taxid_map, on="Species")
# StatsBase.countmap(ictv_vmr_metadata[!, "Host source"])
uCSV.write(joinpath(metadata_directory, "VMR_MSL38_v1 - VMR MSL38 v1.transformed.tsv"), ictv_vmr_metadata, delim='\t')

In [None]:
# Load the Virus-Host DB table; empty fields are decoded as `missing`.
virus_host_db = DataFrames.DataFrame(
    uCSV.read(
        joinpath(metadata_directory, "virushostdb.tsv"),
        delim='\t',
        header=1,
        typedetectrows=721,
        encodings=Dict("" => missing)
    )
)

# Expand every input row into one output row per accession listed in "refseq id"
# (the accessions are separated by ", "), carrying the virus and host fields along.
virus_host_db_transformed = DataFrames.DataFrame()
for record in DataFrames.eachrow(virus_host_db)
    for accession in split(record["refseq id"], ", ")
        expanded = (
            virus_taxid = record["virus tax id"],
            virus_name = record["virus name"],
            virus_lineage = record["virus lineage"],
            host_taxid = record["host tax id"],
            host_name = record["host name"],
            host_lineage = record["host lineage"],
            refseq_id = string(accession),
        )
        # promote=true lets a column widen its element type when a later row needs it.
        push!(virus_host_db_transformed, expanded, promote=true)
    end
end
virus_host_db_transformed = unique(virus_host_db_transformed)

# Flag each row by whether its host taxid belongs to the given clade's taxid set;
# a missing host taxid yields `false`.
for (flag_column, clade_taxids) in (
    "host_is_vertebrate" => vertebrate_taxids,
    "host_is_mammal" => mammal_taxids,
    "host_is_primate" => primate_taxids,
    "host_is_human" => human_taxids,
)
    virus_host_db_transformed[!, flag_column] = map(virus_host_db_transformed[!, "host_taxid"]) do taxid
        !ismissing(taxid) && (taxid in clade_taxids)
    end
end

# Report how many rows fall into each clade.
@show sum(virus_host_db_transformed[!, "host_is_vertebrate"])
@show sum(virus_host_db_transformed[!, "host_is_mammal"])
@show sum(virus_host_db_transformed[!, "host_is_primate"])
@show sum(virus_host_db_transformed[!, "host_is_human"])
uCSV.write(joinpath(metadata_directory, "virushostdb.transformed.tsv"), virus_host_db_transformed, delim='\t')

In [None]:
# Load the NCBI Virus RefSeq export (quoted CSV with a header row). This cell uses its
# "Species" and "Host" columns.
ncbi_virus_refseq_metadata = DataFrames.DataFrame(
    uCSV.read(joinpath(metadata_directory, "NCBI-virus-refseq.csv"), quotes='"', header=1)
)

"""
    lookup_name_taxids(names, name_column, taxid_column)

Resolve `names` with `taxonkit name2taxid` and return the de-duplicated result as a
two-column DataFrame whose columns are called `name_column` and `taxid_column`.

The names are written one per line to `species-names.txt` in the working directory
(overwriting it) and piped into taxonkit, which is run in the `taxonkit` conda
environment. The process chain is waited on and closed, and an error is thrown if any
process in it fails.
"""
function lookup_name_taxids(names, name_column, taxid_column)
    open("species-names.txt", "w") do io
        foreach(name -> println(io, name), names)
    end
    lookup = pipeline(`cat species-names.txt`, `conda run --live-stream -n taxonkit taxonkit name2taxid`)
    # NOTE(review): typedetectrows=1962 is a hard-coded value — confirm it suits every input.
    data, _ = open(lookup) do stream
        uCSV.read(stream, delim='\t', typedetectrows=1962)
    end
    return unique(DataFrames.DataFrame(data, [name_column, taxid_column]))
end

# A taxid cell may be text (empty when no taxid was reported for the name) or an
# already-parsed integer, depending on the column type uCSV detected. Handle both so
# the conversion below does not fail when the column was detected as integers.
has_taxid(value::AbstractString) = !isempty(value)
has_taxid(::Integer) = true
as_taxid(value::AbstractString) = parse(Int, value)
as_taxid(value::Integer) = Int(value)

# Attach virus taxids (join on "Species"), then host taxids (join on "Host"). Inner
# joins keep only rows whose name appears in the corresponding lookup output.
name_taxid_map = lookup_name_taxids(ncbi_virus_refseq_metadata[!, "Species"], "Species", "taxid")
ncbi_virus_refseq_metadata = DataFrames.innerjoin(ncbi_virus_refseq_metadata, name_taxid_map, on="Species")

name_taxid_map = lookup_name_taxids(ncbi_virus_refseq_metadata[!, "Host"], "Host", "host_taxid")
ncbi_virus_refseq_metadata = DataFrames.innerjoin(ncbi_virus_refseq_metadata, name_taxid_map, on="Host")

# Drop rows for which either lookup reported no taxid, then store both taxids as Int.
ncbi_virus_refseq_metadata = ncbi_virus_refseq_metadata[map(has_taxid, ncbi_virus_refseq_metadata[!, "taxid"]), :]
ncbi_virus_refseq_metadata = ncbi_virus_refseq_metadata[map(has_taxid, ncbi_virus_refseq_metadata[!, "host_taxid"]), :]

ncbi_virus_refseq_metadata[!, "taxid"] = as_taxid.(ncbi_virus_refseq_metadata[!, "taxid"])
ncbi_virus_refseq_metadata[!, "host_taxid"] = as_taxid.(ncbi_virus_refseq_metadata[!, "host_taxid"])

# Flag each row by whether its host taxid belongs to the given clade's taxid set.
# The host taxids are plain Ints at this point, so no missing-value check is needed.
for (flag_column, clade_taxids) in (
    "host_is_vertebrate" => vertebrate_taxids,
    "host_is_mammal" => mammal_taxids,
    "host_is_primate" => primate_taxids,
    "host_is_human" => human_taxids,
)
    ncbi_virus_refseq_metadata[!, flag_column] = map(in(clade_taxids), ncbi_virus_refseq_metadata[!, "host_taxid"])
end

# Report how many rows fall into each clade.
@show sum(ncbi_virus_refseq_metadata[!, "host_is_vertebrate"])
@show sum(ncbi_virus_refseq_metadata[!, "host_is_mammal"])
@show sum(ncbi_virus_refseq_metadata[!, "host_is_primate"])
@show sum(ncbi_virus_refseq_metadata[!, "host_is_human"])

uCSV.write(joinpath(metadata_directory, "NCBI-virus-refseq.transformed.tsv"), ncbi_virus_refseq_metadata, delim='\t')