## Objective

download all genomes in an NCBI accession list table
e.g. [this table](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=SARS-CoV-2,%20taxid:2697049)

In [1]:
# papermill parameters
base="$(homedir())/workspace"
project="sars-cov2-pangenome-analysis"

accession_table_file = "$(base)/$(project)/metadata/sequences.csv"
output_directory = "$(base)/$(project)/data/sequences"

"/home/jovyan/workspace/sars-cov2-pangenome-analysis/data/sequences"

## Materials, Methods, and Functions

In [2]:
mkpath(output_directory)

"/home/jovyan/workspace/sars-cov2-pangenome-analysis/data/sequences"

In [3]:
import Pkg
Pkg.update()

pkgs = [
    "Revise",
    "DataFrames",
    "uCSV",
    "ProgressMeter",
    "FASTX"
]

for pkg in pkgs
    try
        eval(Meta.parse("import $pkg"))
    catch
        Pkg.add(pkg)
        eval(Meta.parse("import $pkg"))
    end
end

# Pkg.add(url="https://github.com/cjprybol/Mycelia.git", rev="master")
import Mycelia

[32m[1m    Updating[22m[39m registry at `/opt/julia/registries/General`
[32m[1m   Installed[22m[39m Tables ─ v1.8.1
[32m[1m  No Changes[22m[39m to `~/work/Mycelia/Project.toml`
[32m[1m    Updating[22m[39m `~/work/Mycelia/Manifest.toml`
 [90m [bd369af6] [39m[93m↑ Tables v1.8.0 ⇒ v1.8.1[39m
[32m[1mPrecompiling[22m[39m project...
[32m  ✓ [39m[90mTables[39m
[32m  ✓ [39m[90mTableOperations[39m
[32m  ✓ [39m[90mStructArrays[39m
[32m  ✓ [39m[90mStatsModels[39m
[32m  ✓ [39mGLM
[32m  ✓ [39m[90mPrettyTables[39m
[32m  ✓ [39m[90mGeometryBasics[39m
[32m  ✓ [39m[90mNetworkLayout[39m
[32m  ✓ [39mStatsPlots
[32m  ✓ [39mGraphRecipes
[32m  ✓ [39mDataFrames
[32m  ✓ [39muCSV
[32m  ✓ [39mGenomicAnnotations
[32m  ✓ [39mMycelia
  14 dependencies successfully precompiled in 50 seconds (219 already precompiled)


In [4]:
# @time accession_table = DataFrames.DataFrame(uCSV.read(accession_table_file, typedetectrows=100, header=1, quotes='"')...)

In [5]:
p = ProgressMeter.Progress(countlines(accession_table_file))

ProgressMeter.Progress(6043674, ReentrantLock(nothing, Base.GenericCondition{Base.Threads.SpinLock}(Base.InvasiveLinkedList{Task}(nothing, nothing), Base.Threads.SpinLock(0)), 0), 0.1, 0, 1.663438923729774e9, 1.663438923729774e9, 1.663438923729774e9, false, "Progress: ", nothing, ProgressMeter.BarGlyphs('|', '█', ['▏', '▎', '▍', '▌', '▋', '▊', '▉'], ' ', '|'), :green, IJulia.IJuliaStdio{Base.PipeEndpoint}(IOContext(Base.PipeEndpoint(RawFD(41) open, 0 bytes waiting))), 0, 0, 0, true, false, 1, 1, Int64[])

In [None]:
# sort files by reverse size
# this is helpful to track down incomplete sequences
# can also determine the size of each genome
# then determine if there is a good cutoff at which to just remove the incomplete ones
# ls -lShr

# 2_262_407

table_io = open(accession_table_file)
header = readline(table_io)
# ProgressMeter.@showprogress for accession in accession_table[!, "Accession"]
for line in eachline(table_io)
    accession = split(line, ',')[1]
    outfile = joinpath(output_directory, "$(accession).fna")
    if !isfile(outfile) || (filesize(outfile) == 0)
        fasta_records = collect(Mycelia.get_sequence(db="nuccore", accession=accession))
        open(outfile, "w") do io
            fastx_io = FASTX.FASTA.Writer(io)
            for record in fasta_records
                write(fastx_io, record)
            end
            close(fastx_io)
        end
    end
    ProgressMeter.next!(p)
end
table_io

[32mProgress:  58%|███████████████████████▋                 |  ETA: 14:59:24[39m02[39mm