In [None]:
# Environment setup: activate the project environment, load the development
# checkout of Mycelia, and add + import every package the notebook uses.
#
# If plotting libraries fail to load, try resetting the LD path for Julia;
# it can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# `get` treats an unset variable the same as an empty one instead of raising
# a KeyError before the assertion can even be evaluated.
@assert get(ENV, "LD_LIBRARY_PATH", "") == ""
import Pkg
# Pkg.activate(;temp=true)
Pkg.activate("20240726.minimap2-reads-cleanup")
Pkg.add("Revise")
import Revise

# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Every package referenced by module name later in the notebook must be listed
# here; StatsBase is needed for the `StatsBase.countmap` call in the abundance cell.
pkgs = String[
    "DataFrames",
    "uCSV",
    "OrderedCollections",
    "CSV",
    "StatsBase",
    # "XAM",
    # "CodecZlib"
]
Pkg.add(pkgs)
# Import each package by name so it is reachable as `PackageName.function`.
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end

In [None]:
# The project root is the parent of the current working directory.
project_dir = pwd() |> dirname
# mkpath creates the data directory when it is missing and returns its path.
data_dir = joinpath(project_dir, "data") |> mkpath

In [None]:
# BLAST database name and its local prefix under ~/workspace/blastdb.
db = "nt"
path_to_db = joinpath(homedir(), "workspace", "blastdb", db)
# path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")
# compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)
# Gzipped FASTA path derived from the database prefix; the commented-out
# export call above is presumably what produces this file — TODO confirm.
compressed_fasta_export = string(path_to_db, ".fna.gz")

In [None]:
# Resources for the mapping command: 12 threads, 32 GB of memory per thread.
threads = 12
mem_gb = 32 * threads
# mem_gb = # Int(floor(mem_gb * .8))

In [None]:
# Full paths of every entry under <data_dir>/SRA (join=true prepends the directory).
sra_dirs = readdir(joinpath(data_dir, "SRA"), join=true)

In [None]:
# For each SRA sample directory: locate the trimmed paired-end reads, ask
# Mycelia for the mapping output path and command against the reference FASTA,
# and delete leftover intermediate files from the sample's trim_galore directory.
#
# NOTE: every file in that directory ending in `.tmp` or `.sam` is removed.
#
# `outfile` is declared `global` because top-level loops in a notebook use soft
# scope: on a fresh kernel a plain assignment would create a loop-local variable
# and the later cells that read `outfile` would raise UndefVarError. After the
# loop it holds the output path of the last sample processed.
# sra_dir = first(sra_dirs)

# Loop-invariant: request 80% of the memory budget for the mapping command.
mapping_mem_gb = floor(Int, mem_gb * 0.8)
for sra_dir in sra_dirs
    trim_galore_dir_contents = readdir(joinpath(sra_dir, "trim_galore"), join=true)
    # Named *_reads so the locals do not shadow Base.reverse.
    # `first` throws if the directory holds no matching file.
    forward_reads = first(filter(f -> occursin(r"_1_val_1\.fq\.gz$", f), trim_galore_dir_contents))
    reverse_reads = first(filter(f -> occursin(r"_2_val_2\.fq\.gz$", f), trim_galore_dir_contents))
    map_result = Mycelia.minimap_map_paired_end_with_index(
        fasta = compressed_fasta_export,
        forward = forward_reads,
        reverse = reverse_reads,
        mem_gb = mapping_mem_gb,
        threads = threads,
        as_string = true,
    )
    global outfile = map_result.outfile
    # Only referenced by the commented-out resubmission block below.
    cmd = map_result.cmd

    # sam_hits = filter(f -> occursin(r"\.sam$", f), trim_galore_dir_contents)
    tmp_sam_hits = filter(f -> occursin(r"\.tmp$", f) || occursin(r"\.sam$", f), trim_galore_dir_contents)

    # Report each intermediate file and its size, then delete it.
    for sam_hit in tmp_sam_hits
        display(sam_hit)
        display(Base.format_bytes(filesize(sam_hit)))
        rm(sam_hit)
    end
    # # resubmit if we run out of time
    # if !isfile(outfile)
    #     println(outfile)
    #     Mycelia.scg_sbatch(
    #         job_name=basename(outfile),
    #         mail_user="cameron.prybol@gmail.com",
    #         logdir=mkpath("$(homedir())/workspace/slurmlogs"),
    #         mem_gb=mem_gb,
    #         cpus_per_task=threads,
    #         partition="batch",
    #         account="mpsnyder",
    #         time="7-00:00:00",
    #         cmd=cmd)
    # else
    #     @show isfile(outfile)
    #     @show Base.format_bytes(filesize(outfile))
    # end
    # sleep(3)
end

In [None]:
# Report whether the mapping output path held in `outfile` exists on disk.
@show isfile(outfile)

In [None]:
# Directory holding local BLAST databases, and the prefix of the one used here.
blast_dbs_dir = joinpath(homedir(), "workspace", "blastdb")
blast_db = "nt"
blast_db_path = joinpath(blast_dbs_dir, blast_db)

In [None]:
# Export the taxonomy table for the BLAST database and keep the returned value
# (used later as the input to the taxonomy-table loader).
# Timings from previous runs:
# 4321.050357 seconds (1.38 G allocations: 74.524 GiB, 86.31% gc time, 0.01% compilation time: 38% of which was recompilation)
# 1915.630315 seconds (133.26 k allocations: 6.557 MiB, 0.00% gc time, 0.01% compilation time)
# 1917.061201 seconds (595.03 k allocations: 32.412 MiB, 0.00% gc time, 0.02% compilation time)
blast_db_taxonomy_table_file = @time Mycelia.export_blast_db_taxonomy_table(path_to_db = blast_db_path)

In [None]:
# Project root (parent of the working directory) and its data directory.
basedir = pwd() |> dirname
data_dir = joinpath(basedir, "data")

In [None]:
# Alignment file analysed by the cells below: the path currently held in `outfile`.
xam = outfile

In [None]:
# Load (template, reference) pairs from the alignment file `xam`.
#
# samtools flag filters for reference:
#   samtools view -F 4 yourfile.sam.gz     # filter unmapped only
#   2308 = 4 (unmapped) + 256 (secondary) + 2048 (supplementary)
Mycelia.add_bioconda_env("samtools")
# 550.789620 seconds (481.85 M allocations: 11.977 GiB, 0.01% compilation time)
# The do-block form of `open` closes the process stream even when parsing
# throws, and raises an error if samtools or awk exits with a non-zero status
# instead of silently yielding a truncated table.
record_table = open(
    pipeline(
        `$(Mycelia.CONDA_RUNNER) run --live-stream -n samtools samtools view -F 2308 $(xam)`,
        # Keep only column 1 (read/template name) and column 3 (reference name).
        `awk '{OFS="\t"}{print $1, $3}'`,
    ),
) do io
    @time CSV.read(io, DataFrames.DataFrame, delim='\t', header=["template", "reference"])
end
record_table

In [None]:
# Load the exported taxonomy table into memory.
# Timings from previous runs:
# 476.069645 seconds (123.76 k allocations: 8.292 MiB, 0.05% compilation time: 38% of which was recompilation)
# 318.219621 seconds (606.87 M allocations: 17.838 GiB, 6.44% gc time)
# 388.887537 seconds (1.19 G allocations: 28.142 GiB, 24.04% gc time)
blast_db_taxonomy_table = @time Mycelia.load_blast_db_taxonomy_table(blast_db_taxonomy_table_file)

In [None]:
# Column of the joined table that alignments are tallied by.
taxa_level = "species"
# Maps each alignment file to a (taxon => value) dictionary built from its records.
file_to_taxa_relative_abundances = OrderedCollections.OrderedDict{String, Dict{String, Float64}}()
# ProgressMeter.@showprogress for xam in xams
# Match each alignment's reference name against the taxonomy table's sequence ids.
record_table = DataFrames.innerjoin(
    record_table,
    blast_db_taxonomy_table;
    on = "reference" => "sequence_id",
)
# Distinct taxids present in the alignments, in ascending order.
unique_taxids = sort(unique(record_table.taxid))
# Join in the summarized lineage table so the `taxa_level` column is available.
record_table = DataFrames.innerjoin(
    record_table,
    Mycelia.taxids2taxonkit_summarized_lineage_table(unique_taxids);
    on = "taxid",
)
# Count records per taxon, then normalize the counts with Mycelia's helper.
file_to_taxa_relative_abundances[xam] =
    record_table[!, taxa_level] |> StatsBase.countmap |> Mycelia.normalize_countmap
# end
file_to_taxa_relative_abundances

In [None]:
# Lineage table for the taxids in `unique_taxids`, kept as a standalone variable.
summarized_lineage_table = Mycelia.taxids2taxonkit_summarized_lineage_table(unique_taxids)

In [None]:
# Attach lineage columns to the alignment table, adding only columns that
# `record_table` does not already carry. The abundance cell earlier in this
# notebook already joins the same lineage table, and DataFrames raises an error
# on duplicate non-key column names, so an unconditional join fails on a linear
# run and on any re-run of this cell.
missing_lineage_columns = setdiff(names(summarized_lineage_table), names(record_table))
if !isempty(missing_lineage_columns)
    record_table = DataFrames.innerjoin(
        record_table,
        # `union` keeps the join key exactly once alongside the new columns.
        summarized_lineage_table[!, union(["taxid"], missing_lineage_columns)],
        on = "taxid",
    )
end
record_table

In [None]:
# make a dictionary of record identifiers to taxa ids

# make in memory lists of taxids to records

# write each taxid list to a fasta file of that taxid in order

# after confirming the above works, submit the rest

In [None]:
# unique_sorted_taxa = sort(collect(reduce(union, keys.(values(file_to_taxa_relative_abundances)))))

In [None]:
# n_samples = length(file_to_taxa_relative_abundances)

In [None]:
# abundance_matrix = zeros(length(unique_sorted_taxa), n_samples)
# taxa_names_to_indices = Dict(t => i for (i, t) in enumerate(unique_sorted_taxa))
# for (column, (file, abundances)) in enumerate(file_to_taxa_relative_abundances)
#     # @show column, sample
#     for (taxa, relative_abundance) in abundances
#         row = taxa_names_to_indices[taxa]
#         abundance_matrix[row, column] = relative_abundance
#     end
# end
# abundance_matrix

In [None]:
# abundance_sort_perm = sortperm(abundance_matrix, dims=1)

In [None]:
# abundance_matrix[abundance_sort_perm]

In [None]:
# # Find the sort permutation of the row means vector
# sort_perm = sortperm(vec(Statistics.mean(abundance_matrix, dims=2)))
# file_labels = map(x -> !isnothing(match(r"P\d+(\.\d+)?(e-\d+)?", x)) ? match(r"P\d+(\.\d+)?(e-\d+)?", x).match : "full", basename.(keys(file_to_taxa_relative_abundances)))
# StatsPlots.groupedbar(
#     abundance_matrix[sort_perm, :]',
#     bar_position = :stack,
#     bar_width=0.7, 
#     label = permutedims(unique_sorted_taxa[sort_perm]), 
#     xticks = (1:size(abundance_matrix, 2), file_labels), 
#     xrotation = 45,
#     ylabel = "proportion of reads", 
#     xlabel = "Subsampling proportion",
#     title = "Species relative abundance (Urine spike in)",
#     legend = :outertopright,
#     size = (1000, 500),
#     margins = 10StatsPlots.Plots.PlotMeasures.mm
# )

In [None]:
# Mycelia.parse_xam_to_mapped_records_table