## Objective

Download all genomes in NCBI RefSeq or GenBank that are at or below a specified `taxon_id`.

## Materials, Methods, and Functions

In [None]:
# default parameters -- edit (or override) these to change what gets downloaded
@info "define_parameters"

# NCBI taxonomy ID at the root of the subtree whose genomes are wanted
taxon_id = 10239

# NCBI database to pull from: "refseq" or "genbank"
ncbi_database = "refseq"

# directory the genome FASTA files are written into
data_dir = "../../data/genomes"

In [None]:
@info "initialize"
# Create the output directory (including any missing parent directories) before
# anything is downloaded into it; `mkpath` does not error if it already exists.
mkpath(data_dir)

In [None]:
@info "import libraries"
import Mycelia
import DataFrames
import ProgressMeter

In [None]:
@info "define functions"
"""
    download_genome_from_ncbi_ftp_path(ftp_path; outdir=data_dir)

Download and decompress the genomic FASTA of the NCBI assembly directory `ftp_path`.

The remote file is expected at `<ftp_path>/<basename(ftp_path)>_genomic.fna.gz`.
It is saved into `outdir` and decompressed with `gzip -d`, which removes the `.gz`
file. Work that is already done is skipped: nothing is downloaded when the
compressed file is present, and nothing at all happens when the uncompressed
FASTA is already present.

# Arguments
- `ftp_path`: URL of the assembly directory; a trailing `/` is tolerated.

# Keywords
- `outdir`: directory to write into. Defaults to the notebook-level `data_dir`.

# Returns
The local path of the uncompressed FASTA file.
"""
function download_genome_from_ncbi_ftp_path(ftp_path; outdir = data_dir)
    # https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/819/615/GCF_000819615.1_ViralProj14015/GCF_000819615.1_ViralProj14015_genomic.fna.gz
    # drop trailing slashes so `basename` yields the assembly name rather than ""
    assembly_dir = rstrip(ftp_path, '/')
    fasta_filename = basename(assembly_dir) * "_genomic.fna.gz"
    # URLs always use '/', so do not build them with the OS-dependent `joinpath`
    full_ftp_path = string(assembly_dir, "/", fasta_filename)
    full_local_path = joinpath(outdir, fasta_filename)
    # strip only the trailing extension; a plain ".gz" => "" replacement would also
    # rewrite any directory component that happens to contain ".gz"
    uncompressed_local_path = replace(full_local_path, r"\.gz$" => "")
    if !isfile(uncompressed_local_path)
        if !isfile(full_local_path)
            download(full_ftp_path, full_local_path)
        end
        run(`gzip -d $full_local_path`)
    end
    return uncompressed_local_path
end

In [None]:
@info "download ncbi metadata"
# Load the metadata table for the selected database through Mycelia.
# NOTE(review): presumably this fetches NCBI's assembly summary for `ncbi_database`
# and returns a DataFrame -- confirm against Mycelia. The cells below read its
# "taxid", "species_taxid" and "ftp_path" columns.
ncbi_metadata = Mycelia.load_ncbi_metadata(ncbi_database)

In [None]:
@info "get child taxon_ids"
# `taxonkit list` is expected to print `taxon_id` and every taxon beneath it, one ID
# per line; `--indent ""` removes the tree indentation so each line is a bare integer.
#
# The command is iterated directly instead of through an `open`ed process handle:
# Julia then owns the process and checks its exit status once the output has been
# consumed, so a failed taxonkit run raises an error rather than silently producing
# an empty or partial set.
taxonkit_cmd = `taxonkit list --ids $(taxon_id) --indent ""`
child_taxon_ids = Set{Int}(
    parse(Int, line) for line in eachline(taxonkit_cmd) if !isempty(strip(line))
)
# explicit check rather than `@assert`, which is meant for debugging invariants and
# may be compiled out
if !(taxon_id in child_taxon_ids)
    error(
        "taxon_id $(taxon_id) is missing from the `taxonkit list` output; " *
        "check that the ID is valid and that the taxonkit taxonomy data is installed",
    )
end

In [None]:
@info "filter ncbi metadata down to child taxon ids"
# A row is kept when either its own taxid or its species-level taxid belongs to
# the set of requested taxon IDs.
is_right_taxa = [
    (taxid in child_taxon_ids) || (species_taxid in child_taxon_ids) for
    (taxid, species_taxid) in
    zip(ncbi_metadata[!, "taxid"], ncbi_metadata[!, "species_taxid"])
]
filtered_ncbi_metadata = ncbi_metadata[is_right_taxa, :]

In [None]:
@info "download genomes"
# Only the first 10 matching assemblies are fetched here. To download every genome
# at or below `taxon_id`, loop over `ftp_paths` itself instead of the slice.
ftp_paths = filtered_ncbi_metadata[!, "ftp_path"]
# clamp the slice so a taxon with fewer than 10 assemblies does not raise a BoundsError
n_to_download = min(10, length(ftp_paths))
ProgressMeter.@showprogress for ftp_path in ftp_paths[1:n_to_download]
    download_genome_from_ncbi_ftp_path(ftp_path)
end

## Experimental/Simulated Observations

N/A

## Analysis, Statistics, and Visualizations

N/A

## Summary of Results

N/A

## Conclusions and Future Directions

N/A