In [None]:
# If plotting libraries fail to load, try resetting the LD path for Julia.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
if haskey(ENV, "LD_LIBRARY_PATH")
    @assert(
        ENV["LD_LIBRARY_PATH"] == "",
        "LD_LIBRARY_PATH must be empty; reset it in ~/.local/share/jupyter/kernels/"
    )
end

import Pkg
# Use a temporary (or named) environment to avoid package clashes across
# development projects.
Pkg.activate(; temp=true)
Pkg.add("Revise")
import Revise

# Register the local Mycelia checkout as a development dependency, then load it.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registry packages used by the rest of the notebook; each is installed and imported.
pkgs = [
    "DataFrames",
    "CSV",
    "uCSV",
    "XLSX",
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the import expression from a Symbol instead of parsing a code string,
    # so the package name can never be interpreted as arbitrary code.
    @eval import $(Symbol(pkg))
end

In [None]:
# The project root is the parent of the current working directory.
PROJECT_BASEDIR = dirname(pwd())
# Sibling "data" and "metadata" directories live directly under the project root.
data_dir, metadata_dir = joinpath.(PROJECT_BASEDIR, ("data", "metadata"))
# Create data/genomes (and any missing parents) and keep its path.
genome_dir = mkpath(joinpath(data_dir, "genomes"))

In [None]:
# List the entries currently present in the project data directory.
readdir(data_dir)

In [None]:
# Path to the "VMR_MSL39_v2.xlsx" workbook inside the metadata directory.
# NOTE(review): presumably the ICTV Virus Metadata Resource for MSL39 — confirm source.
VMR_MSL_file = joinpath(metadata_dir, "VMR_MSL39_v2.xlsx")

In [None]:
# Read the "VMR MSL39" sheet into a DataFrame and pass it through
# `Mycelia.drop_empty_columns`.
VMR_MSL_table = Mycelia.drop_empty_columns(
    DataFrames.DataFrame(XLSX.readtable(VMR_MSL_file, "VMR MSL39"))
)
# Keep only the columns whose name does not start with "sub" (case-insensitive).
VMR_MSL_table = DataFrames.select(
    VMR_MSL_table, filter(!contains(r"^sub"i), names(VMR_MSL_table))
)
# Discard every row that has a missing value in any remaining column.
VMR_MSL_table = DataFrames.dropmissing(VMR_MSL_table)

# Row mask: entries flagged "E" in the exemplar/additional-isolate column.
is_examplar = map(VMR_MSL_table[!, "Exemplar or additional isolate"]) do flag
    !ismissing(flag) && flag == "E"
end
# Row mask: a RefSeq accession is present, non-empty, and contains no ";"
# (presumably ";" separates multiple accessions — confirm against the sheet).
in_refseq = map(VMR_MSL_table[!, "Virus REFSEQ accession"]) do accession
    !(ismissing(accession) || isempty(accession) || occursin(";", accession))
end
# Rows satisfying both masks.
refseq_exemplar_table = VMR_MSL_table[is_examplar .& in_refseq, :]

In [None]:
# The per-realm pick below takes the first row of each group, so the table is
# required to already be ordered by "Species Sort" and then "Isolate Sort".
@assert issorted(VMR_MSL_table, ["Species Sort", "Isolate Sort"])

# Notes kept from earlier exploration: a number followed by the expression it
# appears to belong to. NOTE(review): `exemplar_table` is not defined in the cells
# shown here — presumably an earlier name for `refseq_exemplar_table`; confirm.
# 7
# unique(exemplar_table[!, "Realm"])
# 11
# unique(exemplar_table[!, "Kingdom"])
# 19
# unique(exemplar_table[!, "Phylum"])

# Collect the first row of every "Realm" group into a fresh table.
one_per_realm_df = DataFrames.DataFrame()
foreach(DataFrames.groupby(refseq_exemplar_table, "Realm")) do gdf
    # Each group must still be in sort order for "first row" to be meaningful.
    @assert issorted(gdf, ["Species Sort", "Isolate Sort"])
    # promote=true lets column element types widen as rows are appended.
    push!(one_per_realm_df, first(gdf); promote=true)
end
one_per_realm_df

In [None]:
# Request the genome for each per-realm representative, writing into `genome_dir`
# with `compressed=false`.
# The return value is intentionally not bound: assigning it inside this top-level
# loop would overwrite any existing global of the same name under the notebook's
# soft-scope rules, and the value was never used.
for row in DataFrames.eachrow(one_per_realm_df)
    Mycelia.download_genome_by_accession(;
        accession=row["Virus REFSEQ accession"], outdir=genome_dir, compressed=false
    )
end