## Objective

Download every genome assembly in NCBI RefSeq or GenBank that is classified at or below a specified taxon ID (`taxon_id`).

## Materials, Methods, and Functions

In [1]:
# Default parameters for this notebook; edit these values to change what is downloaded.
@info "define_parameters"
# NCBI taxonomy ID; assemblies assigned to this taxon or to any of its child taxa are kept.
# NOTE(review): 10239 is presumably the NCBI taxon for Viruses — confirm.
taxon_id = 10239
# Base directory under which the "genomes" and "joint-fastas" folders are created.
data_dir = "../../data"
# NCBI assembly database to load metadata from: "refseq" or "genbank".
ncbi_database = "refseq"

┌ Info: define_parameters
└ @ Main In[1]:2


"refseq"

In [2]:
@info "initializing genomes directory"
# Build the path with joinpath rather than hand-written "/" interpolation so the
# separator is always correct. mkpath creates any missing parent directories and
# returns the path it was given.
genome_directory = mkpath(joinpath(data_dir, "genomes"))

┌ Info: initializing genomes directory
└ @ Main In[2]:1


"../../data/genomes"

In [3]:
@info "initializing joint fastas directory"
# Build the path with joinpath rather than hand-written "/" interpolation so the
# separator is always correct. mkpath creates any missing parent directories and
# returns the path it was given.
joint_fastas_directory = mkpath(joinpath(data_dir, "joint-fastas"))

┌ Info: initializing joint fastas directory
└ @ Main In[3]:1


"../../data/joint-fastas"

In [4]:
@info "import libraries"
import Mycelia
import DataFrames
import ProgressMeter

┌ Info: import libraries
└ @ Main In[4]:1


In [5]:
@info "define functions"

"""
    filetype_to_ncbi_extension(filetype)

Return the NCBI assembly file suffix that corresponds to `filetype`.

Supported values are `"fasta"` (`"genomic.fna.gz"`) and `"gff"` (`"genomic.gff.gz"`).

# Throws
- `ErrorException` listing the supported file types when `filetype` is not one of them.
"""
function filetype_to_ncbi_extension(filetype)
    extensions = Dict(
        "fasta" => "genomic.fna.gz",
        "gff" => "genomic.gff.gz",
    )
    # Explicit membership test instead of try/catch: only a genuinely unknown file type
    # produces the "please choose" error, and no unrelated exception is swallowed.
    if !haskey(extensions, filetype)
        # Sort the options so the message is stable; Dict iteration order is unspecified.
        key_options = join(sort!(collect(keys(extensions))), '\n')
        error("please choose one of the following\n$key_options")
    end
    return extensions[filetype]
end

"""
    download_filetype_from_ncbi_ftp_path(outdir, ftp_path, filetype)

Download one file of an NCBI assembly into `outdir`, decompress it with `gzip -d`, and
return the path of the decompressed file.

The remote file name is the last component of `ftp_path`, an underscore, and the suffix
returned by `filetype_to_ncbi_extension(filetype)`. Existing work is reused: nothing is
done when the decompressed file already exists, and the download is skipped when the
compressed file is already present.

# Arguments
- `outdir`: existing local directory that receives the file.
- `ftp_path`: URL of the assembly directory; a trailing `/` is tolerated.
- `filetype`: a file type accepted by `filetype_to_ncbi_extension`.

# Returns
The local path of the decompressed file.
"""
function download_filetype_from_ncbi_ftp_path(outdir, ftp_path, filetype)
    # Drop any trailing slash so that basename yields the assembly name rather than "".
    remote_directory = String(rstrip(ftp_path, '/'))
    filename = basename(remote_directory) * "_" * filetype_to_ncbi_extension(filetype)
    # URLs always use "/"; joinpath would insert the platform path separator instead.
    full_ftp_path = remote_directory * "/" * filename
    full_local_path = joinpath(outdir, filename)
    # Strip only the trailing ".gz"; a plain substring replace would also rewrite any
    # ".gz" that happens to occur inside `outdir` or the assembly name.
    uncompressed_local_path = replace(full_local_path, r"\.gz$" => "")
    if !isfile(uncompressed_local_path)
        if !isfile(full_local_path)
            # Download under a temporary name and rename on success, so an interrupted
            # transfer is never mistaken for a complete archive on the next run.
            partial_local_path = full_local_path * ".part"
            download(full_ftp_path, partial_local_path)
            mv(partial_local_path, full_local_path; force=true)
        end
        run(`gzip -d $full_local_path`)
    end
    return uncompressed_local_path
end

┌ Info: define functions
└ @ Main In[5]:1


download_filetype_from_ncbi_ftp_path (generic function with 1 method)

In [6]:
@info "download ncbi metadata"
# Load the assembly metadata table for the selected database ("refseq" or "genbank").
# Later cells read its "taxid", "species_taxid" and "ftp_path" columns.
# NOTE(review): presumably Mycelia fetches NCBI's assembly summary over the network — confirm.
ncbi_metadata = Mycelia.load_ncbi_metadata(ncbi_database)

┌ Info: download ncbi metadata
└ @ Main In[6]:1


Unnamed: 0_level_0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category
Unnamed: 0_level_1,String,String,String,String,String
1,GCF_000001215.4,PRJNA164,SAMN02803731,,reference genome
2,GCF_000001405.40,PRJNA168,,,reference genome
3,GCF_000001635.27,PRJNA169,,,reference genome
4,GCF_000001735.4,PRJNA116,SAMN03081427,,reference genome
5,GCF_000001905.1,PRJNA70973,SAMN02953622,AAGU00000000.3,representative genome
6,GCF_000001985.1,PRJNA32665,SAMN02953685,ABAR00000000.1,representative genome
7,GCF_000002035.6,PRJNA13922,SAMN06930106,,reference genome
8,GCF_000002075.1,PRJNA209509,SAMN02953658,AASC00000000.3,representative genome
9,GCF_000002235.5,PRJNA13728,SAMN00829422,AAGJ00000000.6,representative genome
10,GCF_000002285.5,PRJNA12384,SAMN02953603,AAEX00000000.4,na


In [7]:
@info "get child taxon_ids"
# `read(cmd, String)` waits for taxonkit to finish and throws if it exits with a non-zero
# status, so a failed run cannot silently yield a truncated ID list. The process handle
# returned by a bare `open(cmd)` is never closed and its exit status is never checked.
taxonkit_output = read(`taxonkit list --ids $(taxon_id) --indent ""`, String)
# One taxon ID per line; blank lines are skipped.
child_taxon_ids = Set([parse(Int, line) for line in eachline(IOBuffer(taxonkit_output)) if !isempty(line)])
# Explicit validation rather than @assert, which is intended for internal invariants and
# may be disabled.
if !(taxon_id in child_taxon_ids)
    error("taxon_id $taxon_id is missing from the taxonkit output; check the ID and the taxonkit taxonomy data")
end

┌ Info: get child taxon_ids
└ @ Main In[7]:1


In [8]:
@info "filter ncbi metadata down to child taxon ids"
# A row is kept when either its own taxid or its species-level taxid is one of the
# child taxon IDs; the two columns are walked in lockstep to build one Bool mask.
is_right_taxa = [
    (taxid in child_taxon_ids) || (species_taxid in child_taxon_ids)
    for (taxid, species_taxid) in
        zip(ncbi_metadata[!, "taxid"], ncbi_metadata[!, "species_taxid"])
]
filtered_ncbi_metadata = ncbi_metadata[is_right_taxa, :]

┌ Info: filter ncbi metadata down to child taxon ids
└ @ Main In[8]:1


Unnamed: 0_level_0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid
Unnamed: 0_level_1,String,String,String,String,String,Int64
1,GCF_000819615.1,PRJNA485481,,,na,2886930
2,GCF_000820355.1,PRJNA485481,,,na,191289
3,GCF_000820495.2,PRJNA485481,,,na,518987
4,GCF_000836805.1,PRJNA485481,,,reference genome,90963
5,GCF_000836825.1,PRJNA485481,,,na,1891754
6,GCF_000836845.1,PRJNA485481,,,na,196399
7,GCF_000836865.1,PRJNA485481,,,na,10570
8,GCF_000836885.1,PRJNA485481,,,na,57579
9,GCF_000836905.1,PRJNA485481,,,na,2905681
10,GCF_000836925.1,PRJNA485481,,,na,10868


In [9]:
# Remote assembly directory URL of every genome that passed the taxon filter.
ftp_list = filtered_ncbi_metadata[!, "ftp_path"]
# Debugging alternative: restrict to the first 10 rows for a quick trial run.
# ftp_list = filtered_ncbi_metadata[1:10, "ftp_path"]

11699-element Vector{String}:
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000819615.1_ViralProj14015"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 33 bytes ⋯ "820355.1_ViralMultiSegProj14361"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 33 bytes ⋯ "820495.2_ViralMultiSegProj14656"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836805.1_ViralProj14012"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836825.1_ViralProj14017"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836845.1_ViralProj14021"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836865.1_ViralProj14025"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836885.1_ViralProj14030"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836905.1_ViralProj14035"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836925.1_ViralProj14039"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836945.1_ViralProj14044"
 "https://ftp.ncbi.nlm.nih.gov/ge" ⋯ 25 bytes ⋯ "/GCF_000836965.1_Vi

In [10]:
@info "download genomes"
# Fetch and decompress the FASTA of every assembly, showing a progress bar.
# The returned local path is not used in this cell, so it is not bound to a variable.
ProgressMeter.@showprogress for ftp_path in ftp_list
    download_filetype_from_ncbi_ftp_path(genome_directory, ftp_path, "fasta")
end

In [None]:
@info "merge genomes into pangenome fasta"
# One combined FASTA per taxon ID, named "<taxon_id>.fasta".
joint_fasta = joinpath(joint_fastas_directory, "$taxon_id.fasta")
# The do-block form closes the output file even when a download or read throws part-way
# through, so the handle is never leaked.
open(joint_fasta, "w") do write_io
    ProgressMeter.@showprogress for ftp_path in ftp_list
        # Yields the decompressed local FASTA, downloading it only if it is not yet present.
        outfile = download_filetype_from_ncbi_ftp_path(genome_directory, ftp_path, "fasta")
        # Append the genome's bytes verbatim to the joint file.
        write(write_io, read(outfile))
    end
end

## Experimental/Simulated Observations

N/A

## Analysis, Statistics, and Visualizations

N/A

## Summary of Results

N/A

## Conclusions and Future Directions

N/A