## Objective

Download all genomes in NCBI RefSeq or GenBank that are assigned to a specified taxon (`taxon_id`) or to any taxon beneath it.

## Materials, Methods, and Functions

In [None]:
# default parameters
@info "define_parameters"
# NCBI taxonomy ID of the root taxon; records at or below it are selected for download
taxon_id = 10239
# base directory under which the genomes/ and joint-fastas/ directories are created
data_dir = "../../data"
# NCBI database to pull metadata and genomes from: "refseq" or "genbank"
ncbi_database = "refseq"

In [None]:
@info "define functions"

# Disabled workaround, kept for reference: mkpath normally does not error when the directory
# already exists, but it appeared to misbehave when the data directory was mounted with rclone.
# function my_mkpath(dir)
#     if !isdir(dir)
#         mkpath(dir)
#     end
#     return dir
# end

"""
    filetype_to_ncbi_extension(filetype)

Return the NCBI file-name suffix that corresponds to `filetype`.

Supported values are `"fasta"` (suffix `"genomic.fna.gz"`) and `"gff"` (suffix
`"genomic.gff.gz"`). Any other value raises an `ErrorException` whose message lists the
supported values, one per line, in sorted order.
"""
function filetype_to_ncbi_extension(filetype)
    extensions = Dict(
        "fasta" => "genomic.fna.gz",
        "gff" => "genomic.gff.gz",
    )
    # Check membership explicitly instead of catching the lookup failure: a bare `catch`
    # would also hide unrelated errors behind the "please choose" message.
    if !haskey(extensions, filetype)
        # Dict iteration order is unspecified; sort so the message is reproducible.
        key_options = join(sort!(collect(keys(extensions))), '\n')
        error("please choose one of the following\n$key_options")
    end
    return extensions[filetype]
end

"""
    download_filetype_from_ncbi_ftp_path(outdir, ftp_path, filetype)

Download one file of an NCBI assembly into `outdir` and return the local file path.

The remote file name is the last component of `ftp_path`, an underscore, and the suffix
returned by `filetype_to_ncbi_extension(filetype)`. If a file with that name already
exists in `outdir`, nothing is downloaded and the existing path is returned.

# Arguments
- `outdir`: existing local directory that receives the file.
- `ftp_path`: remote directory of the assembly (a trailing `/` is tolerated).
- `filetype`: any value accepted by `filetype_to_ncbi_extension`.
"""
function download_filetype_from_ncbi_ftp_path(outdir, ftp_path, filetype)
    # A trailing slash would make basename() return "" and yield a bogus file name.
    remote_dir = rstrip(ftp_path, '/')
    filename = basename(remote_dir) * "_" * filetype_to_ncbi_extension(filetype)
    # URLs always use "/" as separator; joinpath would use the OS path separator instead.
    full_ftp_path = remote_dir * "/" * filename
    full_local_path = joinpath(outdir, filename)
    if !isfile(full_local_path)
        # Download under a temporary name and rename on success, so an interrupted
        # transfer is never mistaken for a finished file by the isfile() check above.
        partial_path = full_local_path * ".part"
        try
            download(full_ftp_path, partial_path)
            mv(partial_path, full_local_path; force=true)
        finally
            # no-op after a successful rename; removes the leftover after a failure
            rm(partial_path; force=true)
        end
    end
    return full_local_path
end

In [None]:
"""
    bgzip_recompress(infile[, outfile])

Decompress the gzip file `infile` and recompress it with `bgzip`, writing the result to
`outfile`, which is returned.

`outfile` defaults to `infile` with its trailing `.gz` replaced by `.bgz`. Requires the
`gzip` and `bgzip` executables to be available on the PATH.

Raises an `AssertionError` if `infile` does not end in `.gz` or does not exist.
"""
function bgzip_recompress(infile, outfile=replace(infile, r"\.gz$" => ".bgz"))
    @assert occursin(r"\.gz$", infile) "file does not end with .gz, skipping..."
    # These two lines previously referenced an undefined variable `file`, so every call
    # with a .gz path failed with UndefVarError before any work was done.
    @assert isfile(infile) "file $infile not found, skipping..."
    # gzip -dc streams the decompressed bytes to bgzip, whose stdout is written to outfile
    p = pipeline(pipeline(`gzip -dc $infile`, `bgzip`), outfile)
    run(p)
    return outfile
end

In [None]:
@info "initializing genomes directory"
# create the directory (if needed) that receives the individual genome downloads
genome_directory = mkpath("$data_dir/genomes")

In [None]:
@info "initializing joint fastas directory"
# create the directory (if needed) that receives the merged per-taxon FASTA file
joint_fastas_directory = mkpath("$data_dir/joint-fastas")

In [None]:
# import Pkg
# Pkg.build("Mycelia")

In [None]:
@info "import libraries"
import Pkg
pkgs = [
"DataFrames",
"ProgressMeter",
]
Pkg.add(pkgs)
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end

import Mycelia

In [None]:
@info "download ncbi metadata"
# load the metadata table for the selected database via Mycelia; later cells read its
# "taxid", "species_taxid", and "ftp_path" columns
ncbi_metadata = Mycelia.load_ncbi_metadata(ncbi_database)

In [None]:
@info "get child taxon_ids"
# Ask taxonkit for the taxon IDs at or below taxon_id, un-indented, one ID per line.
# eachline(cmd) runs the command itself and, once all output has been consumed, raises an
# error if taxonkit exited unsuccessfully; wrapping the command in open() left the process
# handle unclosed and its exit status unchecked.
child_taxon_ids = Set{Int}(
    parse(Int, line) for
    line in eachline(`taxonkit list --ids $(taxon_id) --indent ""`) if !isempty(line)
)
# sanity check: the requested taxon itself is expected to be part of the listing
@assert taxon_id in child_taxon_ids "taxonkit did not report taxon_id $taxon_id itself"

In [None]:
@info "filter ncbi metadata down to child taxon ids"
# A record is kept when its own taxid or its species-level taxid is one of the child taxon ids.
is_right_taxa = map(
    ncbi_metadata[!, "taxid"], ncbi_metadata[!, "species_taxid"]
) do record_taxid, record_species_taxid
    (record_taxid in child_taxon_ids) | (record_species_taxid in child_taxon_ids)
end
filtered_ncbi_metadata = ncbi_metadata[is_right_taxa, :]

In [None]:
# "ftp_path" value of every retained record; each entry is later passed to
# download_filetype_from_ncbi_ftp_path as the remote location
ftp_list = filtered_ncbi_metadata[!, "ftp_path"]
# alternative that limits the run to the first 10 records (presumably for quick test runs)
# ftp_list = filtered_ncbi_metadata[1:10, "ftp_path"]

In [10]:
@info "download genomes"
# Fetch the "fasta" file of every entry in ftp_list into genome_directory, with a progress bar.
# download_filetype_from_ncbi_ftp_path skips files that already exist locally.
ProgressMeter.@showprogress for ftp_path in ftp_list
    # the returned local path is not used in this loop
    outfile = download_filetype_from_ncbi_ftp_path(genome_directory, ftp_path, "fasta")
end

In [None]:
@info "merge genomes into pangenome fasta"
joint_fasta = "$joint_fastas_directory/$taxon_id.fasta"
# NOTE(review): the per-genome files are named *_genomic.fna.gz and their bytes are appended
# unchanged, so this output is presumably a concatenation of gzip streams despite its
# .fasta name — confirm that downstream readers expect that.
# The do-block form closes the output file even if a download or read throws part-way
# through the loop; an explicit open()/close() pair leaked the handle in that case.
open(joint_fasta, "w") do write_io
    ProgressMeter.@showprogress for ftp_path in ftp_list
        # returns the local path, downloading first only if the file is not present yet
        outfile = download_filetype_from_ncbi_ftp_path(genome_directory, ftp_path, "fasta")
        write(write_io, read(outfile))
    end
end

## Experimental/Simulated Observations

N/A

## Analysis, Statistics, and Visualizations

N/A

## Summary of Results

N/A

## Conclusions and Future Directions

N/A