In [None]:
# If plotting libraries misbehave, try resetting LD_LIBRARY_PATH for the Julia
# kernel (it can be set under ~/.local/share/jupyter/kernels/).
# The check below stops the cell when the variable is present and non-empty.
if haskey(ENV, "LD_LIBRARY_PATH")
    @assert ENV["LD_LIBRARY_PATH"] == ""
end

import Pkg
# Work in a temporary environment so packages do not clash across development projects.
Pkg.activate(temp=true)
Pkg.add("Revise")
import Revise

# Use the local development checkout of Mycelia.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registered packages to install and then import by name.
pkgs = [
    "DataFrames",
]
Pkg.add(pkgs)
foreach(pkgs) do pkg
    eval(Meta.parse("import $pkg"))
end

In [None]:
# Load the RefSeq metadata table and, for every value of the "group" column,
# keep the row with the smallest "genome_size" among the rows passing the filters.
PROJECT_BASEDIR = dirname(pwd())
ncbi_metadata = Mycelia.load_refseq_metadata()

# Build each lookup set once rather than once per row inside the predicate.
accepted_assembly_levels = Set(["Chromosome", "Complete Genome"])
accepted_refseq_categories = Set(["reference genome", "representative genome"])

# A row is kept when it is not excluded from RefSeq, has full genome
# representation, an accepted assembly level and RefSeq category, and a
# genome_size above 10_000.
keep_row =
    (ncbi_metadata[!, "excluded_from_refseq"] .== "na") .&
    (ncbi_metadata[!, "genome_rep"] .== "Full") .&
    in.(ncbi_metadata[!, "assembly_level"], Ref(accepted_assembly_levels)) .&
    in.(ncbi_metadata[!, "refseq_category"], Ref(accepted_refseq_categories)) .&
    (ncbi_metadata[!, "genome_size"] .> 10_000)
filtered_ncbi_metadata = ncbi_metadata[keep_row, :]

# Pick the minimum-size row of each group in a single pass (no per-group sort).
# `argmin` returns the first minimal index, so ties resolve to the earliest row.
# `keepkeys=false` leaves the columns in their original order, since each returned
# row already carries the "group" column.
genomes_by_group = DataFrames.groupby(filtered_ncbi_metadata, "group")
smallest_genomes = DataFrames.combine(genomes_by_group; keepkeys=false) do group_df
    return group_df[argmin(group_df[!, "genome_size"]), :]
end
sort!(smallest_genomes, "genome_size")

In [None]:
# Take the first three rows of `smallest_genomes`. `first` returns however many
# rows exist when there are fewer than three, where `[1:3, :]` would throw a
# BoundsError.
selected_genomes = first(smallest_genomes, 3)

In [None]:
# Fetch the sequence records for every selected genome and write them to
# <PROJECT_BASEDIR>/data/genomes/<assembly accession>.fna.
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = mkpath(joinpath(data_dir, "genomes"))
for genome_row in DataFrames.eachrow(selected_genomes)
    # Turn the row's FTP path into the URL of its "genomic.fna.gz" file.
    genome_url = Mycelia.ncbi_ftp_path_to_url(
        ftp_path=genome_row["ftp_path"],
        extension="genomic.fna.gz",
    )
    genome_records = collect(Mycelia.get_sequence(db="nuccore", ftp=genome_url))
    # One output file per assembly, named after its accession.
    genome_outfile = joinpath(genome_dir, genome_row["#assembly_accession"] * ".fna")
    Mycelia.write_fasta(outfile=genome_outfile, records=genome_records)
end