In [None]:
# If plotting libraries fail to load, try resetting the LD path for Julia
# (it can be set in the kernel spec under ~/.local/share/jupyter/kernels/).
if haskey(ENV, "LD_LIBRARY_PATH")
    @assert ENV["LD_LIBRARY_PATH"] == ""
end

# Work in a throwaway environment so this notebook does not touch the default one.
import Pkg
Pkg.activate(; temp=true)
Pkg.add("Revise")
import Revise

# Alternative locations of the Mycelia checkout:
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registered packages the notebook imports by their qualified names.
pkgs = String[
    "DataFrames",
    "FASTX",
    "XAM",
    "uCSV",
    "CodecZlib",
    "ProgressMeter",
    "StatsBase",
    "Statistics",
    "CSV",
    "Random",
    "Distributions",
    "Plots",
    "OrderedCollections",
    "StatsPlots",
    "Colors"
]
Pkg.add(pkgs)
# Import every package listed above into the notebook's top-level module.
for package_name in pkgs
    @eval import $(Symbol(package_name))
end

In [None]:
# The project base directory is the parent of the notebook's working directory;
# the data lives in its "data" subdirectory.
basedir = pwd() |> dirname
data_dir = joinpath(basedir, "data")

In [None]:
# Identifier of the sequencing run; later cells use it as the run's subdirectory name under `data_dir`.
RUN_ID = "r64342e_20240621_140056"

In [None]:
# copy the run's data over from the remote data warehouse into data_dir (uncomment to run)
# run(`rclone copy locus_data_warehouse:Genomics/SequelIIe/$(RUN_ID) $(data_dir)/$(RUN_ID)`)

In [None]:
# Locate the run metadata XML inside the run's "1_A01" directory.
# The lookup happens inside a `let` block so the helper names do not leak into the
# notebook's global namespace, and a missing file produces an explicit error
# instead of the generic failure `first` raises on an empty collection.
xml = let run_directory = joinpath(data_dir, RUN_ID, "1_A01")
    candidates = filter(path -> occursin(r"\.run\.metadata\.xml", path), readdir(run_directory, join=true))
    isempty(candidates) && error("no run metadata XML (*.run.metadata.xml) found in $(run_directory)")
    first(candidates)
end

In [None]:
# Parse the run metadata XML into a sample table. Later cells read its "BarcodeName" and
# "BioSampleName" columns and add an "xam" column to it.
sample_to_barcode_table = Mycelia.extract_pacbiosample_information(xml)

In [None]:
# Full paths of the entries in the run's "1_A01" directory whose name starts with "bc" followed by digits.
barcode_directories = filter(readdir(joinpath(data_dir, RUN_ID, "1_A01"); join=true)) do path
    occursin(r"^bc\d+", basename(path))
end

In [None]:
# Map against the BLAST NT database.
blast_db = "nt"
# Directory holding the local BLAST databases (~/workspace/blastdb).
blast_dbs_dir = joinpath(homedir(), "workspace", "blastdb")
# path_to_db = joinpath(homedir(), "workspace", "blastdb", blast_db)
# Path prefix of the selected database inside that directory.
blast_db_path = joinpath(blast_dbs_dir, blast_db)

# # path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")
# # compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)
# compressed_fasta_export = blast_db_path * ".fna.gz"

In [None]:
# map to Locus internal genomes
# locus_fasta = "/global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data/locus-internal.fna.gz"

In [None]:
# for d in barcode_directories
#     bam_file = first(filter(x -> occursin(r"\.bam", x), readdir(d, join=true)))
#     # 5655.810957 seconds
#     # 111.551031 seconds (70.00 k allocations: 4.719 MiB, 0.11% compilation time)
#     fastq_file = Mycelia.bam_to_fastq(bam=bam_file)

#     cmd, outfile = Mycelia.minimap_map(fasta=compressed_fasta_export, fastq=fastq_file, mem_gb=Mycelia.NERSC_MEM, mapping_type="map-hifi", threads=Mycelia.NERSC_CPU, as_string=true)
#     # display(outfile)
#     if !isfile(outfile)
#         println(outfile)
#         Mycelia.nersc_sbatch_regular(
#             job_name=basename(outfile),
#             mail_user="cameron.prybol@gmail.com",
#             logdir=mkpath("$(homedir())/workspace/slurmlogs"),
#             mem_gb=Mycelia.NERSC_MEM,
#             cpus_per_task=Mycelia.NERSC_CPU,
#             cmd=cmd)
#     end
# end

In [None]:
# here I made a copy of the fastq file to try submitting to the premium queue
# premium queue jumps straight to the front of the line, but caps @ 5 submissions

# for d in barcode_directories
#     bam_file = first(filter(x -> occursin(r"\.bam", x), readdir(d, join=true)))
#     # 5655.810957 seconds
#     # 111.551031 seconds (70.00 k allocations: 4.719 MiB, 0.11% compilation time)
#     fastq_file = Mycelia.bam_to_fastq(bam=bam_file)
#     fastq_file_alt = replace(fastq_file, ".gz" => ".copy.gz")
#     # display(fastq_file_alt)
#     if !isfile(fastq_file_alt)
#         cp(fastq_file, fastq_file_alt)
#     end

#     cmd, outfile = Mycelia.minimap_map(fasta=compressed_fasta_export, fastq=fastq_file_alt, mem_gb=Mycelia.NERSC_MEM, mapping_type="map-hifi", threads=Mycelia.NERSC_CPU, as_string=true)
#     # display(outfile)
#     if !isfile(outfile)
#         println(outfile)
#         Mycelia.nersc_sbatch_premium(
#             job_name=basename(outfile),
#             mail_user="cameron.prybol@gmail.com",
#             logdir=mkpath("$(homedir())/workspace/slurmlogs"),
#             mem_gb=Mycelia.NERSC_MEM,
#             cpus_per_task=Mycelia.NERSC_CPU,
#             cmd=cmd)
#     end
# end

In [None]:
# trigger analysis

In [None]:
# find the BAM files and read them in, then apply the standard microbiome taxonomy analysis (TODO: name the source of that analysis)

In [None]:
# Map each barcode directory name to the path of its alignment file against the NT export,
# then record that path in an "xam" column of the sample table.
barcode_to_xam = Dict{String, String}()
for barcode_directory in barcode_directories
    barcode = basename(barcode_directory)
    # Deliberately NOT named `xams`: in a notebook (soft scope) assigning to `xams` here
    # would overwrite the global `xams` defined in a later cell whenever this cell is re-run.
    nt_alignments = filter(
        path -> occursin(Mycelia.XAM_REGEX, path) && occursin("nt.fna.gz", path),
        readdir(barcode_directory, join=true),
    )
    # Fail with a message that names the directory rather than a bare error from `first`.
    isempty(nt_alignments) && error("no alignment file matching \"nt.fna.gz\" found in $(barcode_directory)")
    # println("$(barcode)\t$(first(nt_alignments))")
    barcode_to_xam[barcode] = first(nt_alignments)
end
# Look up the alignment file of every sample through its barcode name.
sample_to_barcode_table[!, "xam"] = [barcode_to_xam[barcode] for barcode in sample_to_barcode_table[!, "BarcodeName"]]
sample_to_barcode_table

In [None]:
# Alignment file paths, one per sample row, taken from the "xam" column of the sample table.
xams = sample_to_barcode_table[!, "xam"]

In [None]:
# Export the taxonomy table of the BLAST database and load the exported result; the whole step is timed.
@time blast_db_taxonomy_table = Mycelia.export_blast_db_taxonomy_table(path_to_db = blast_db_path) |> Mycelia.load_blast_db_taxonomy_table

In [None]:
# Taxonomic rank (a column of the lineage table) at which reads are summarised;
# swap the active line to change it.
# taxa_level = "species"
taxa_level = "genus"
# taxa_level = "family"
# Alignment file path => (value of the `taxa_level` column, possibly missing => relative abundance),
# filled in the order of `xams`.
file_to_taxa_relative_abundances = OrderedCollections.OrderedDict{String, Dict{Union{Missing, String}, Float64}}()
ProgressMeter.@showprogress for xam in xams
    @time record_table = Mycelia.parse_xam_to_mapped_records_table(xam)
    # keep only the rows whose "isprimary" flag is true
    record_table = record_table[record_table[!, "isprimary"], :]
    # join the records to the BLAST taxonomy table (record "reference" == table "sequence_id")
    record_table = DataFrames.innerjoin(record_table, blast_db_taxonomy_table, on="reference" => "sequence_id")
    unique_taxids = sort(unique(record_table[!, "taxid"]))
    # join in the summarised lineage of every taxid seen in this file
    record_table = DataFrames.innerjoin(record_table, Mycelia.taxids2taxonkit_summarized_lineage_table(unique_taxids), on="taxid")
    # Computed once: the original evaluated this expression twice and discarded the first result.
    file_to_taxa_relative_abundances[xam] = Mycelia.normalize_countmap(StatsBase.countmap(record_table[!, taxa_level]))
end
file_to_taxa_relative_abundances

In [None]:
# Sorted list of every taxon that appears in at least one file's abundance mapping.
unique_sorted_taxa = sort!(collect(mapreduce(keys, union, values(file_to_taxa_relative_abundances))))

In [None]:
# Number of samples: one entry in the per-file abundance mapping per sample
# (used as the number of columns of the abundance matrix).
n_samples = length(file_to_taxa_relative_abundances)

In [None]:
# Dense taxa-by-sample matrix of relative abundances: rows follow `unique_sorted_taxa`,
# columns follow the insertion order of `file_to_taxa_relative_abundances`.
# Taxa that were not observed in a sample keep the initial value 0.
abundance_matrix = zeros(length(unique_sorted_taxa), n_samples)
taxa_names_to_indices = Dict(taxon => index for (index, taxon) in enumerate(unique_sorted_taxa))
for (sample_column, abundances) in enumerate(values(file_to_taxa_relative_abundances))
    for (taxon, relative_abundance) in pairs(abundances)
        abundance_matrix[taxa_names_to_indices[taxon], sample_column] = relative_abundance
    end
end
abundance_matrix

In [None]:
# Sort permutation of the abundance matrix computed along dims=1 (i.e. within each column).
# NOTE(review): the `dims` keyword of `sortperm` is only available in newer Julia releases
# (1.9+, I believe) — confirm against the kernel's Julia version.
abundance_sort_perm = sortperm(abundance_matrix, dims=1)

In [None]:
# Index the matrix with the permutation from the previous cell to inspect the result;
# presumably this shows each column's values in sorted order — TODO confirm.
abundance_matrix[abundance_sort_perm]

In [None]:
# Alignment file path => BioSampleName, built from the rows of the sample table.
file_to_identifier = Dict(
    sample.xam => sample.BioSampleName
    for sample in DataFrames.eachrow(sample_to_barcode_table)
)

In [None]:
# One distinguishable colour per taxon. White and black are passed as seed colours and,
# with `dropseed=true`, are not themselves part of the returned palette.
colorscheme = Colors.distinguishable_colors(
    length(unique_sorted_taxa),
    [Colors.RGB(1,1,1), Colors.RGB(0,0,0)];
    dropseed=true,
)

In [None]:
# `file_labels` used to be defined only in a later cell, so this cell failed with an
# UndefVarError on a fresh kernel (Restart & Run All). Define it here, before first use:
# one sample label per entry of the abundance mapping, in the mapping's order.
file_labels = [file_to_identifier[k] for k in keys(file_to_taxa_relative_abundances)]
# Permutation that puts the sample labels in sorted order.
sortperm(file_labels)

In [None]:
## BASE - INCLUDES EVERYTHING
# Stacked bar chart with one bar per sample and one segment per taxon.
# Taxa are ordered by ascending mean abundance across samples (sort permutation of the row means).
sort_perm = sortperm(vec(Statistics.mean(abundance_matrix, dims=2)))
# Sample labels in the column order of `abundance_matrix`, and the permutation that sorts them.
file_labels = [file_to_identifier[k] for k in keys(file_to_taxa_relative_abundances)]
label_sort_perm = sortperm(file_labels)

StatsPlots.groupedbar(
    # transposed so that rows are samples (bars) and columns are taxa (stacked series)
    abundance_matrix[sort_perm, label_sort_perm]',
    bar_position = :stack,
    bar_width=0.7, 
    # label = permutedims(unique_sorted_taxa[sort_perm]),
    label = false,
    xticks = (1:size(abundance_matrix, 2), file_labels[label_sort_perm]), 
    xrotation = 45,
    ylabel = "proportion of reads", 
    # the x ticks are sample names, so label the axis accordingly
    # (it previously read "Subsampling proportion")
    xlabel = "sample",
    title = "$(taxa_level) relative abundance",
    legend = :outertopright,
    size = (1000, 500),
    margins = 10StatsPlots.Plots.PlotMeasures.mm,
    # 1×n row of colours, one per series; `permutedims` builds the same row as
    # `hcat(colours...)` without splatting every colour as a separate argument
    seriescolor = permutedims(reverse(colorscheme))
)

In [None]:
# Stacked bar chart restricted to the taxa with the highest mean abundance across samples.
top_N = 30
# Taxa ordered by ascending mean abundance (sort permutation of the row means).
sort_perm = sortperm(vec(Statistics.mean(abundance_matrix, dims=2)))
# Sample labels in the column order of `abundance_matrix`, and the permutation that sorts them.
file_labels = [file_to_identifier[k] for k in keys(file_to_taxa_relative_abundances)]
label_sort_perm = sortperm(file_labels)
# Never ask for more taxa than exist: the original `end-(top_N-1):end` slices raised a
# BoundsError whenever fewer than `top_N` taxa were present.
n_shown = min(top_N, length(sort_perm))
# Row indices of the `n_shown` most abundant taxa (the tail of the ascending order).
top_taxa = sort_perm[end-(n_shown-1):end]
StatsPlots.groupedbar(
    # transposed so that rows are samples (bars) and columns are taxa (stacked series)
    abundance_matrix[top_taxa, label_sort_perm]',
    bar_position = :stack,
    bar_width=0.7, 
    label = permutedims(unique_sorted_taxa[top_taxa]), 
    xticks = (1:size(abundance_matrix, 2), file_labels[label_sort_perm]), 
    xrotation = 45,
    ylabel = "proportion of reads", 
    # the x ticks are sample names, so label the axis accordingly
    # (it previously read "Subsampling proportion")
    xlabel = "sample",
    title = "$(taxa_level) relative abundance (Top $(n_shown))",
    legend = :outertopright,
    size = (1000, 500),
    margins = 10StatsPlots.Plots.PlotMeasures.mm,
    # same colours these taxa get in the full plot: the tail of the reversed palette,
    # built with `permutedims` rather than splatting every colour into `hcat`
    seriescolor = permutedims(reverse(colorscheme))[:, end-(n_shown-1):end]
)