In [None]:
# Troubleshooting: if the plotting libraries fail to load, try resetting
# LD_LIBRARY_PATH for the Julia kernel. It can be set in the kernel spec under
# ~/.local/share/jupyter/kernels/
# Stop early when LD_LIBRARY_PATH is present but not empty.
haskey(ENV, "LD_LIBRARY_PATH") && @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg
# Work inside a temporary package environment for this notebook session.
Pkg.activate(;temp=true)
Pkg.add("Revise")
import Revise

# Alternative Mycelia checkout locations, kept for reference:
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
# Develop Mycelia from the checkout under the user's home directory.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registered packages used by the rest of this notebook.
pkgs = String[
    "DataFrames",
    "FASTX",
    "XAM",
    "uCSV",
    "CodecZlib",
    "ProgressMeter",
    "StatsBase",
    "Statistics",
    "CSV",
    "Random",
    "Distributions",
    "Plots"
]
Pkg.add(pkgs)
# `import` (rather than `using`) each package so call sites stay module-qualified.
# The package name is spliced into the import expression as a Symbol.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end

In [None]:
"""
    zipf_distribution(n)

Return the unnormalized Zipf-style weights `[1/1, 1/2, ..., 1/n]` as a vector.

The weights are not normalized; divide by their sum to obtain proportions.
When `1:n` is empty (for example `n == 0`) the result is an empty vector.
"""
function zipf_distribution(n)
    # Broadcast over the range directly; materializing it with `collect` first
    # only allocates an intermediate vector that is thrown away.
    return 1 ./ (1:n)
end

"""
    dirichlet_distribution(alpha)

Draw a single random sample from `Distributions.Dirichlet(alpha)` and return it.
"""
function dirichlet_distribution(alpha)
    dirichlet = Distributions.Dirichlet(alpha)
    return rand(dirichlet)
end

In [None]:
# The base directory is the parent of the current working directory;
# `data_dir` is its "data" subdirectory.
basedir = pwd() |> dirname
data_dir = joinpath(basedir, "data")

In [None]:
# Bind the result of Mycelia.list_species() and report how long the call took.
# The trailing semicolon keeps the notebook from displaying the result.
@time ncbi_species_table = Mycelia.list_species();

In [None]:
# Bind the result of Mycelia.load_refseq_metadata() and report how long the call took.
# Later cells index it by the "species_taxid" column.
@time refseq_metadata_table = Mycelia.load_refseq_metadata();

In [None]:
# function subsample(t, n)
#     return t[StatsBase.sample(1:DataFrames.nrow(t), n, ordered=true, replace=false), :]
# end

"""
    sample_evenly_by_group(df, column, N)

Build a new `DataFrame` by repeatedly drawing rows from `df` until it holds `N` rows.

Each draw first picks one group of `DataFrames.groupby(df, column)` uniformly at
random and then one row of that group uniformly at random, so every group is equally
likely to contribute regardless of its size. Draws are independent of one another,
which means the same row can be selected more than once.
"""
function sample_evenly_by_group(df, column, N)
    groups = collect(DataFrames.groupby(df, column))
    sampled = DataFrames.DataFrame()
    while DataFrames.nrow(sampled) < N
        # Pick the group first, then a row inside it.
        group = rand(groups)
        row_index = rand(1:DataFrames.nrow(group))
        push!(sampled, group[row_index, :])
    end
    return sampled
end

# N is the number of rows sampled per species in the cells below.
# NOTE(review): N is also used as the RNG seed, so changing the sample size
# changes the seed as well — confirm this coupling is intended.
N = 1
Random.seed!(N)

In [None]:
# e coli: keep the RefSeq rows whose species_taxid is 562.
# Sub-taxa based alternative, kept for reference:
# ec_taxa_ids = Set(Mycelia.list_subtaxa(562))
# ec_metadata_table = refseq_metadata_table[map(x -> x in ec_taxa_ids, refseq_metadata_table[!, "species_taxid"]), :]
# Author's counts: 37891 distinct accessions, 2271 distinct taxids.
ec_metadata_table = refseq_metadata_table[refseq_metadata_table.species_taxid .== 562, :]
# Inspect the per-taxid row counts:
# sort(collect(StatsBase.countmap(ec_metadata_table[!, "taxid"])), by=x->x[2], rev=true)
# Plain subsampling alternative:
# ec_metadata_table = subsample(ec_metadata_table, N)
# Draw N rows, giving every taxid group the same chance of being picked.
ec_metadata_table = sample_evenly_by_group(ec_metadata_table, "taxid", N)

In [None]:
# staph: keep the RefSeq rows whose species_taxid is 1280.
# Sub-taxa based alternative, kept for reference:
# sa_taxa_ids = Set(Mycelia.list_subtaxa(1280))
# Author's counts: 16411 distinct accessions, 4109 distinct taxids.
sa_metadata_table = refseq_metadata_table[refseq_metadata_table.species_taxid .== 1280, :]
# Inspect the per-taxid row counts:
# sort(collect(StatsBase.countmap(sa_metadata_table[!, "taxid"])), by=x->x[2], rev=true)
# Draw N rows, giving every taxid group the same chance of being picked.
sa_metadata_table = sample_evenly_by_group(sa_metadata_table, "taxid", N)

In [None]:
# pseudomonas: keep the RefSeq rows whose species_taxid is 287.
# Sub-taxa based alternative, kept for reference:
# pa_taxa_ids = Set(Mycelia.list_subtaxa(287))
# Author's counts: 9555 distinct accessions, 248 distinct taxids.
pa_metadata_table = refseq_metadata_table[refseq_metadata_table.species_taxid .== 287, :]
# Inspect the per-taxid row counts:
# sort(collect(StatsBase.countmap(pa_metadata_table[!, "taxid"])), by=x->x[2], rev=true)
# Draw N rows, giving every taxid group the same chance of being picked.
pa_metadata_table = sample_evenly_by_group(pa_metadata_table, "taxid", N)

In [None]:
# taxa_ids_of_interest = Set(union(ec_taxa_ids, sa_taxa_ids, pa_taxa_ids))
# accessions_of_interest_metadata_table = refseq_metadata_table[map(x -> x in taxa_ids_of_interest, refseq_metadata_table[!, "taxid"]) .| map(x -> x in taxa_ids_of_interest, refseq_metadata_table[!, "species_taxid"]), :]

In [None]:
# Stack the three per-species samples into one table.
# Row order follows the argument order: e coli, then staph, then pseudomonas.
accessions_of_interest_metadata_table = vcat(ec_metadata_table, sa_metadata_table, pa_metadata_table)

In [None]:
# Start with an empty-string placeholder column, then fill it in one genome at a time.
accessions_of_interest_metadata_table[!, "out_directory"] .= ""
ProgressMeter.@showprogress for row in eachrow(accessions_of_interest_metadata_table)
    # Store whatever Mycelia.ncbi_genome_download_accession returns for this accession.
    row["out_directory"] = Mycelia.ncbi_genome_download_accession(
        accession = row["#assembly_accession"],
        outdir = joinpath(data_dir, "ncbi-genomes"),
        # include_string="gbff"
    )
end
# Show the filled-in column.
accessions_of_interest_metadata_table.out_directory

In [None]:
# The LOCUS line in the GenBank files could be used to specify genome topology; skipping that for now.

In [None]:
# read in each genome, and tag the relative abundance (and eventually, topology) in the joined fasta files

In [None]:
# Number of genomes = number of rows in the combined metadata table.
n_genomes = DataFrames.nrow(accessions_of_interest_metadata_table)

In [None]:
# One Zipf weight per genome, rescaled so the weights sum to 1.
# Randomized alternative, kept for reference:
# dirichlet_distribution(zipf_distribution(10^3))
raw_abundances = zipf_distribution(n_genomes)
normalized_abundances = raw_abundances / sum(raw_abundances)

In [None]:
# Display the combined metadata table for inspection.
accessions_of_interest_metadata_table

In [None]:
# Attach the normalized weights as a column: row i gets the i-th weight, so
# abundance decreases with row order (the order produced by the vcat above).
accessions_of_interest_metadata_table[!, "relative_abundances"] = normalized_abundances

In [None]:
# Take the output directory recorded for the first row of the table.
accession_dir = first(accessions_of_interest_metadata_table[!, "out_directory"])

In [None]:
# First entry of `accession_dir` (full path) whose name matches Mycelia.FASTA_REGEX.
accession_fasta = first(filter(contains(Mycelia.FASTA_REGEX), readdir(accession_dir; join=true)))