In [None]:
# Environment setup for this notebook.
#
# If plotting libraries fail to load, try resetting LD_LIBRARY_PATH for the Julia
# kernel; it can be set in the kernel spec under ~/.local/share/jupyter/kernels/
#
# The check below raises an explicit error instead of using `@assert`: assertions are
# intended for internal invariants and may be disabled, and the explicit message tells
# the user what to change.
if !isempty(get(ENV, "LD_LIBRARY_PATH", ""))
    error(
        "LD_LIBRARY_PATH must be unset or empty for this notebook, got: " *
        ENV["LD_LIBRARY_PATH"],
    )
end
import Pkg
# Use a temporary environment so package operations here do not touch a persistent project.
Pkg.activate(; temp=true)
Pkg.add("Revise")
import Revise

# Alternative locations of the Mycelia checkout:
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Packages required by the remaining cells of this notebook.
pkgs = String[
    "DataFrames", "FASTX", "XAM", "uCSV",
    "CodecZlib", "ProgressMeter", "StatsBase", "BioAlignments",
    "OrderedCollections", "StatsPlots", "Statistics", "CSV",
]
Pkg.add(pkgs)
# Bring each package into scope with `import` (module name only), so later cells
# reference functions with a qualified name such as `DataFrames.innerjoin`.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end

In [None]:
# Location of the BLAST database: <home>/workspace/blastdb/<database name>.
blast_db = "nt_prok"
blast_dbs_dir = joinpath(homedir(), "workspace", "blastdb")
blast_db_path = joinpath(blast_dbs_dir, blast_db)

In [None]:
# Export the taxonomy table for the BLAST database at `blast_db_path`, then load it.
# Both steps are timed together as a single expression.
# Timings recorded from earlier runs:
# 476.069645 seconds (123.76 k allocations: 8.292 MiB, 0.05% compilation time: 38% of which was recompilation)
# 4321.050357 seconds (1.38 G allocations: 74.524 GiB, 86.31% gc time, 0.01% compilation time: 38% of which was recompilation)
@time blast_db_taxonomy_table =
    Mycelia.export_blast_db_taxonomy_table(path_to_db = blast_db_path) |>
    Mycelia.load_blast_db_taxonomy_table

In [None]:
# The project base is the parent of the current working directory;
# input data is read from its "data" subdirectory.
basedir = pwd() |> dirname
data_dir = joinpath(basedir, "data")

In [None]:
# Gather the gzipped SAM files for SRR9202034 and keep the (up to) three smallest by
# file size. `first(…, 3)` is used instead of `[1:3]` so that a directory holding fewer
# than three matching files yields what is available rather than a BoundsError.
sam_gz_paths = filter(
    path -> occursin(r"\.sam\.gz$", path),
    readdir(joinpath(data_dir, "SRR9202034"), join=true),
)
xams = first(sort(sam_gz_paths, by=filesize), 3)

In [None]:
# Reference composition as species name => percent abundance, sorted by species name.
species_percent_abundances = sort([
    "Acinetobacter baumannii" => 0.18,
    "Bacillus pacificus" => 1.80,
    "Phocaeicola vulgatus" => 0.02,
    "Bifidobacterium adolescentis" => 0.02,
    "Clostridium beijerinckii" => 1.80,
    "Cutibacterium acnes" => 0.18,
    "Deinococcus radiodurans" => 0.02,
    "Enterococcus faecalis" => 0.02,
    "Escherichia coli" => 18.0,
    "Helicobacter pylori" => 0.18,
    "Lactobacillus gasseri" => 0.18,
    "Neisseria meningitidis" => 0.18,
    "Porphyromonas gingivalis" => 18.0,
    "Pseudomonas paraeruginosa" => 1.80,
    "Cereibacter sphaeroides" => 18.0,
    "Schaalia odontolytica" => 0.02,
    "Staphylococcus aureus" => 1.80,
    "Staphylococcus epidermidis" => 18.0,
    "Streptococcus agalactiae" => 1.80,
    "Streptococcus mutans" => 18.0
])

# Normalizing constant: the sum of all listed percentages.
value_total = sum(last.(species_percent_abundances))

# Genus-level relative abundances, where the genus is the first whitespace-separated
# token of the species name. Several species share a genus (Staphylococcus,
# Streptococcus), so their contributions are accumulated; building the Dict directly
# from a generator would keep only the last species seen for each genus and the
# fractions would no longer sum to 1.
genus_relative_abundances = Dict{String, Float64}()
for (species, percent) in species_percent_abundances
    genus = String(first(split(species)))
    genus_relative_abundances[genus] =
        get(genus_relative_abundances, genus, 0.0) + percent / value_total
end
genus_relative_abundances

In [None]:
# Column of the joined lineage table used to group reads.
taxa_level = "genus"

# Maps a label ("reference" or an alignment file's basename) to its
# taxon => relative abundance dictionary, in insertion order.
file_to_taxa_relative_abundances = OrderedCollections.OrderedDict{String, Dict{String, Float64}}()
file_to_taxa_relative_abundances["reference"] = genus_relative_abundances

ProgressMeter.@showprogress for xam in xams
    # Parse the alignment file (timed) and keep only rows flagged `isprimary`.
    @time records = Mycelia.parse_xam_to_mapped_records_table(xam)
    records = records[records[!, "isprimary"], :]
    # Attach taxonomy rows by matching each record's reference to a sequence_id.
    records = DataFrames.innerjoin(
        records,
        blast_db_taxonomy_table,
        on = "reference" => "sequence_id",
    )
    # Fetch lineage information once per distinct taxid, then join it back on.
    lineage_table = Mycelia.taxids2taxonkit_summarized_lineage_table(
        sort(unique(records[!, "taxid"])),
    )
    records = DataFrames.innerjoin(records, lineage_table, on = "taxid")
    # Count records per taxon at the chosen level and normalize the counts.
    taxa_counts = StatsBase.countmap(records[!, taxa_level])
    file_to_taxa_relative_abundances[basename(xam)] = Mycelia.normalize_countmap(taxa_counts)
end
file_to_taxa_relative_abundances

In [None]:
# Sorted list of every taxon name that appears in at least one abundance dictionary.
unique_sorted_taxa = sort!(collect(mapreduce(keys, union, values(file_to_taxa_relative_abundances))))

In [None]:
# Number of entries in the abundance dictionary (the "reference" entry plus one per
# alignment file added to it); used as the column count of the abundance matrix.
n_samples = length(file_to_taxa_relative_abundances)

In [None]:
# Build a taxa × samples matrix of relative abundances. Rows follow
# `unique_sorted_taxa`, columns follow the insertion order of
# `file_to_taxa_relative_abundances`; taxa missing from a sample stay at zero.
taxa_names_to_indices = Dict(zip(unique_sorted_taxa, eachindex(unique_sorted_taxa)))
abundance_matrix = zeros(length(unique_sorted_taxa), n_samples)
for (column, abundances) in enumerate(values(file_to_taxa_relative_abundances))
    for taxon in keys(abundances)
        abundance_matrix[taxa_names_to_indices[taxon], column] = abundances[taxon]
    end
end
abundance_matrix

In [None]:
# Sort permutation of the abundance matrix computed along dimension 1 (within each
# column). NOTE(review): the plotting cell visible below orders rows by their mean
# instead and does not reference this value — confirm whether it is still needed.
abundance_sort_perm = sortperm(abundance_matrix, dims=1)

In [None]:
# Order taxa (rows) by ascending mean relative abundance across all columns.
sort_perm = sortperm(dropdims(Statistics.mean(abundance_matrix, dims=2), dims=2))

# Label each column with the `P<number>` token found in its file name; names
# without such a token (the reference entry) are labelled "reference".
subsample_pattern = r"P\d+(\.\d+)?(e-\d+)?"
file_labels = map(basename.(keys(file_to_taxa_relative_abundances))) do name
    hit = match(subsample_pattern, name)
    isnothing(hit) ? "reference" : hit.match
end

# Stacked bar chart: one bar per column of the abundance matrix, one series per taxon.
stacked_values = abundance_matrix[sort_perm, :]'
taxa_labels = permutedims(unique_sorted_taxa[sort_perm])
column_ticks = (1:size(abundance_matrix, 2), file_labels)
StatsPlots.groupedbar(
    stacked_values;
    bar_position = :stack,
    bar_width = 0.7,
    label = taxa_labels,
    xticks = column_ticks,
    xrotation = 45,
    ylabel = "proportion of reads",
    xlabel = "Subsampling proportion",
    title = "ATCC MSA-1003 reference",
    legend = :outertopright,
    size = (1000, 500),
    margins = 10 * StatsPlots.Plots.PlotMeasures.mm,
)