In [None]:
# If plotting libraries fail to load, the usual cause is an inherited
# LD_LIBRARY_PATH; reset it for the Julia kernel (it can be set in the kernel
# spec under ~/.local/share/jupyter/kernels/).
# This is an environment precondition, so it is checked with an explicit error
# instead of @assert, which is intended for internal invariants only.
if !isempty(get(ENV, "LD_LIBRARY_PATH", ""))
    error("LD_LIBRARY_PATH must be unset or empty, got: $(ENV["LD_LIBRARY_PATH"])")
end
import Pkg
Pkg.activate(;temp=true)
Pkg.add("Revise")
import Revise

# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

pkgs = String[
    "DataFrames",
    "FASTX",
    "XAM",
    "uCSV",
    "CodecZlib",
    "ProgressMeter",
    "StatsBase",
    "BioAlignments",
    "OrderedCollections",
    "StatsPlots",
    "Statistics"
]
Pkg.add(pkgs)
# `import` (not `using`) each package so that its names must be module-qualified.
# The import expression is built from a Symbol rather than by parsing a string.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end

In [None]:
# rclone copy --progress --verbose --drive-chunk-size 2G --drive-upload-cutoff 1T --tpslimit 1 locus_data_warehouse:Genomics/SequelIIe/r64342e_20240416_185917/1_A01/bc2056--bc2056/m64342e_240416_193427.hifi_reads.bc2056--bc2056.bam /global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data
# /global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data
# tarchive = "/global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data/locus-reference-genomes.tar.gz"
# tarchive = "/global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data/SRR9202034.tar.gz"
# Mycelia.tar_extract(tarchive=tarchive)

In [None]:
# blast_db = "nt"
# blast_dbs_dir = joinpath([homedir(), "workspace", "blastdb"])
# blast_db_path = joinpath(blast_dbs_dir, blast_db)
# # 476.069645 seconds (123.76 k allocations: 8.292 MiB, 0.05% compilation time: 38% of which was recompilation)
# # 4321.050357 seconds (1.38 G allocations: 74.524 GiB, 86.31% gc time, 0.01% compilation time: 38% of which was recompilation)
# @time blast_db_taxonomy_table = Mycelia.load_blast_db_taxonomy_table(Mycelia.export_blast_db_taxonomy_table(path_to_db = blast_db_path))

In [None]:
# The base directory is the parent of the current working directory;
# input files are read from its "data" subdirectory.
basedir = pwd() |> dirname
data_dir = joinpath(basedir, "data")

In [None]:
# Gzipped SAM files in data_dir whose path mentions the bc2056 HiFi read set,
# ordered from smallest to largest file size.
read_mapping_files = sort!(
    filter(
        path -> occursin(r"\.sam\.gz$", path) &&
            occursin("m64342e_240416_193427.hifi_reads.bc2056--bc2056.bam", path),
        readdir(data_dir, join=true),
    ),
    by = filesize,
)

In [None]:
# Gzipped FASTQ files in data_dir, ordered from smallest to largest file size.
fastqs = sort!(
    filter(path -> occursin(r"\.fq\.gz$", path), readdir(data_dir, join=true)),
    by = filesize,
)
# nt_mappings = filter(x -> occursin("nt.fna.gz", x), read_mapping_files)[1:5]
# Keep only the read mappings whose path contains "locus".
locus_internal_mappings = [path for path in read_mapping_files if occursin("locus", path)]

In [None]:
# Map each alignment file to the relative abundance of every strain it contains,
# preserving the order in which files are processed.
file_to_strain_relative_abundances = OrderedCollections.OrderedDict{String, Dict{String, Float64}}()
ProgressMeter.@showprogress for xam in locus_internal_mappings
    records_table = Mycelia.parse_xam_to_summary_table(xam)
    # Only rows flagged as both mapped and primary contribute to the counts.
    mapped_primary = records_table[!, "ismapped"] .& records_table[!, "isprimary"]
    references = records_table[mapped_primary, "reference"]
    # The strain label is the part of the reference name before the first "__".
    strains = String.(first.(split.(references, "__")))
    strain_counts = StatsBase.countmap(strains)
    file_to_strain_relative_abundances[xam] = Mycelia.normalize_countmap(strain_counts)
end
file_to_strain_relative_abundances

In [None]:
# Sorted list of every taxon observed in at least one sample.
# `init` keeps the reduction well-defined when no samples were loaded: the
# result is then an empty vector instead of an error on an empty collection.
unique_sorted_taxa = sort!(
    collect(
        mapreduce(keys, union, values(file_to_strain_relative_abundances); init=Set{String}()),
    ),
)

In [None]:
# One sample per alignment file that was summarized.
n_samples = length(keys(file_to_strain_relative_abundances))

In [None]:
# Dense taxa-by-sample matrix: rows follow unique_sorted_taxa, columns follow the
# iteration order of file_to_strain_relative_abundances. Taxa that are absent
# from a sample keep the initial value of zero.
taxa_names_to_indices = Dict(zip(unique_sorted_taxa, eachindex(unique_sorted_taxa)))
abundance_matrix = zeros(length(unique_sorted_taxa), n_samples)
for (column, abundances) in enumerate(values(file_to_strain_relative_abundances))
    for (taxa, relative_abundance) in pairs(abundances)
        abundance_matrix[taxa_names_to_indices[taxa], column] = relative_abundance
    end
end
abundance_matrix

In [None]:
# Sort permutation computed independently along dimension 1 (within each column).
# NOTE: the `dims` keyword of sortperm requires Julia 1.9 or newer.
abundance_sort_perm = sortperm(abundance_matrix; dims=1)

In [None]:
# Apply the per-column permutation to view each sample's abundances in sorted order.
getindex(abundance_matrix, abundance_sort_perm)

In [None]:
# Order taxa by their mean relative abundance across samples (ascending) so the
# stacking order and the legend are consistent from bar to bar.
sort_perm = sortperm(vec(Statistics.mean(abundance_matrix, dims=2)))

# Label each sample with the "P<number>" token found in its file name, or "full"
# when the name has no such token. The regex is matched once per file, and the
# match is converted to String so that all labels share one element type.
file_labels = map(basename.(keys(file_to_strain_relative_abundances))) do name
    m = match(r"P\d+(\.\d+)?(e-\d+)?", name)
    return isnothing(m) ? "full" : String(m.match)
end

# Stacked bars: one bar per sample (rows of the transposed matrix), one series per taxon.
StatsPlots.groupedbar(
    abundance_matrix[sort_perm, :]',
    bar_position = :stack,
    bar_width = 0.7,
    label = permutedims(unique_sorted_taxa[sort_perm]),
    xticks = (1:size(abundance_matrix, 2), file_labels),
    xrotation = 45,
    ylabel = "proportion of reads",
    xlabel = "Subsampling proportion",
    title = "Strain relative abundance (Urine spike in)",
    legend = :outertopright,
    size = (1000, 500),
    margins = 10 * StatsPlots.Plots.PlotMeasures.mm
)