In [None]:
# Notebook environment bootstrap.
# If you hit plotting-library issues, try resetting the LD path for Julia;
# it can be set in the kernel specs under ~/.local/share/jupyter/kernels/.
# An unset or empty LD_LIBRARY_PATH passes; any other value stops the notebook here.
@assert isempty(get(ENV, "LD_LIBRARY_PATH", "")) "LD_LIBRARY_PATH must be unset or empty"
import Pkg
# Work in a throwaway environment so nothing leaks into the default project.
Pkg.activate(; temp=true)
Pkg.add("Revise")
import Revise

# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registered packages this notebook needs; each is installed and then imported
# (import, not using, so every call below stays module-qualified).
pkgs = String[
    "DataFrames",
    "FASTX",
    "XAM",
    "uCSV",
    "CodecZlib",
    "ProgressMeter",
    "StatsBase",
    "Statistics",
    "CSV",
    "Random",
    "Distributions",
    "Plots",
    "OrderedCollections",
    "StatsPlots",
    "Colors"
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the import expression from a Symbol rather than parsing a string.
    @eval import $(Symbol(pkg))
end

In [None]:
# The project root is the parent of the current working directory;
# input data lives in <root>/data.
basedir = pwd() |> dirname
data_dir = joinpath(basedir, "data")

In [None]:
# Run identifier; used below as the per-run subdirectory name under data_dir.
RUN_ID = "r64342e_20240910_225812"

In [None]:
# Copy the run's data from the remote data warehouse into data_dir (uncomment to run)
# run(`rclone copy locus_data_warehouse:Genomics/SequelIIe/$(RUN_ID) $(data_dir)/$(RUN_ID)`)

In [None]:
# Locate the run-metadata XML inside the run's 1_A01 directory.
# The first matching path (in readdir order) is used; `first` throws if nothing matches.
xml = first([
    path for path in readdir(joinpath(data_dir, RUN_ID, "1_A01"); join=true)
    if occursin(r"\.run\.metadata\.xml", path)
])

In [None]:
# Build the sample/barcode table from the run metadata XML, then keep only the
# rows whose BioSampleName does not contain "urine" (case-insensitive).
sample_to_barcode_table = Mycelia.extract_pacbiosample_information(xml)
sample_to_barcode_table = sample_to_barcode_table[
    [!occursin(r"urine"i, name) for name in sample_to_barcode_table[!, "BioSampleName"]],
    :,
]

In [None]:
# Barcode directories of the run (names starting with "bc<digits>") that also
# appear in the sample table's BarcodeName column. The lookup set is built once
# up front instead of once per directory entry.
barcode_directories = let expected_barcodes = Set(sample_to_barcode_table[!, "BarcodeName"])
    filter(readdir(joinpath(data_dir, RUN_ID, "1_A01"), join=true)) do path
        name = basename(path)
        occursin(r"^bc\d+", name) && (name in expected_barcodes)
    end
end

In [None]:
# map to blast NT
# Dated snapshot of the NT database; reads are mapped against its FASTA export.
blast_db = "20240418.nt"
blast_dbs_dir = joinpath(homedir(), "workspace", "blastdb")
# path_to_db = joinpath(homedir(), "workspace", "blastdb", blast_db)
blast_db_path = joinpath(blast_dbs_dir, blast_db)

# # path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")
# # compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)
compressed_fasta_export = blast_db_path * ".fna.gz"
# Fail loudly (not via @assert, which may be disabled) when the export is missing:
# everything downstream needs this file.
isfile(compressed_fasta_export) || error("BLAST FASTA export not found: $(compressed_fasta_export)")

In [None]:
# # map to Locus internal genomes
# # locus_fasta = "/global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data/locus-internal.fna.gz"

# locus_c_strain_fasta = joinpath(data_dir, "locus-c-strains.fna")

In [None]:
"""
    filesize_human_readable(f)

Return the size of the file at path `f` as a human-readable string, as
formatted by `Base.format_bytes`.
"""
filesize_human_readable(f) = Base.format_bytes(filesize(f))

In [None]:
# function minimap_map_with_index(
#         fasta = compressed_fasta_export,
#         mem_gb = Mycelia.NERSC_MEM,
#         mapping_type = "map-hifi",
#         threads = Mycelia.NERSC_CPU,
#         fastq
#         as_string=false,
#         denominator=6
#     )

In [None]:
# Mycelia.minimap_map_with_index(
#         fasta = compressed_fasta_export,
#         mem_gb = Mycelia.NERSC_MEM,
#         mapping_type = "map-hifi",
#         threads = Mycelia.NERSC_CPU,
#         fastq = fastq_file,
#         as_string=true,
#         denominator=6
#     )

In [None]:
# Ask Mycelia for the minimap index command and its output path for the NT
# FASTA export, then report whether that output already exists on disk.
# NOTE(review): as_string=true presumably returns the command without running it — confirm.
cmd, outfile = Mycelia.minimap_index(;
    fasta = compressed_fasta_export,
    mapping_type = "map-hifi",
    mem_gb = Mycelia.NERSC_MEM,
    threads = 1,
    denominator = 6,
    as_string = true,
)
isfile(outfile)

In [None]:
# For every barcode directory: convert its BAM to FASTQ, derive the mapping
# command and output path, and record the outputs that already exist.
# Outputs that are missing are only printed; job submission is commented out below.
present_xams = String[]

# here I made a copy of the fastq file to try submitting to the premium queue
# premium queue jumps straight to the front of the line, but caps @ 5 submissions

for d in barcode_directories
    # println(d)
    # Anchor the pattern so only names ending in ".bam" qualify; the unanchored
    # form also matched companions such as "*.bam.pbi" and relied on sort order.
    bam_file = first(filter(x -> occursin(r"\.bam$", x), readdir(d, join=true)))
    # display(filesize_human_readable(bam_file))
    # 5655.810957 seconds
    # 111.551031 seconds (70.00 k allocations: 4.719 MiB, 0.11% compilation time)
    fastq_file = Mycelia.bam_to_fastq(bam=bam_file)
    # println(filesize_human_readable(fastq_file))

    cmd, outfile = Mycelia.minimap_map_with_index(
            fasta = compressed_fasta_export,
            mem_gb = Mycelia.NERSC_MEM,
            mapping_type = "map-hifi",
            threads = Mycelia.NERSC_CPU,
            fastq = fastq_file,
            as_string=true,
            denominator=6
        )

    # cmd, outfile = Mycelia.minimap_map(fasta=locus_c_strain_fasta, fastq=fastq_file, mem_gb=Mycelia.NERSC_MEM, mapping_type="map-hifi", threads=Mycelia.NERSC_CPU, as_string=true)
    # cmd, outfile = Mycelia.minimap_map(fasta=compressed_fasta_export, fastq=fastq_file, mem_gb=Mycelia.NERSC_MEM, mapping_type="map-hifi", threads=Mycelia.NERSC_CPU, as_string=true)
    # display(cmd)
    # display(outfile)
    if isfile(outfile)
        push!(present_xams, outfile)
    else
        # Not mapped yet: print the expected output path so it can be submitted.
        println(outfile)
        # Mycelia.nersc_sbatch_premium(
        # Mycelia.nersc_sbatch_regular(
        #     job_name=basename(outfile),
        #     mail_user="cameron.prybol@gmail.com",
        #     logdir=mkpath("$(homedir())/workspace/slurmlogs"),
        #     mem_gb=Mycelia.NERSC_MEM,
        #     cpus_per_task=Mycelia.NERSC_CPU,
        #     cmd=cmd)
    end
end
present_xams

In [None]:
# trigger analysis

# find the mapped alignment files and read them in, then run the standard microbiome taxonomy analysis

# Key each existing alignment file by its barcode, i.e. the name of the
# directory that contains it.
barcode_to_xam = Dict(basename(dirname(xam)) => xam for xam in present_xams)

# barcode_to_xam = Dict()
# for barcode_directory in barcode_directories
#     barcode = basename(barcode_directory)
#     # && occursin("locus-c-strains.fna", x)
#     xams = filter(x -> occursin(Mycelia.XAM_REGEX, x) && occursin(".nt.fna.gz", x), readdir(barcode_directory, join=true))
#     @show xams
#     xam = first(xams)
#     # println("$(barcode)\t$(xam)")
#     barcode_to_xam[barcode] = xam
# end
# barcode_to_xam

# Attach the alignment path to every sample (missing when no file exists yet),
# then drop incomplete rows.
# NOTE(review): dropmissing without a column list drops rows with a missing
# value in ANY column, not only "xam" — confirm that is intended.
sample_to_barcode_table[!, "xam"] = [
    get(barcode_to_xam, barcode, missing)
    for barcode in sample_to_barcode_table[!, "BarcodeName"]
]
sample_to_barcode_table = DataFrames.dropmissing(sample_to_barcode_table)

# Sample metadata embedded as tab-separated text: a header line followed by one
# row per sample. The fields are separated by literal tab characters, which the
# parser below depends on, and the "LIMS ID" column is the join key used later.
data_str = """
Sample Type	Visit	Time (hr)	LIMS ID	Primary/Backup	Patient ID
Stool	1	0	897	Primary	103-005
Stool	2	0	898	Primary	103-005
Stool	5	0	907	Primary	103-005
Stool	7	0	1013	Primary	103-005
Stool	8	0	1576	Primary	103-005
Stool	1	0	1551	Primary	103-007
Stool	2	0	1552	Primary	103-007
Stool	5	0	1563	Primary	103-007
Stool	7	0	1738	Primary	103-007
Stool	8	0	2539	Primary	103-007
Stool	1	0	1728	Primary	103-008
Stool	2	0	1739	Primary	103-008
Stool	5	0	1748	Primary	103-008
Stool	7	0	1911	Primary	103-008
Stool	8	0	2645	Primary	103-008
Stool	1	0	5226	Primary	103-013
Stool	5	0	5312	Primary	103-013
Stool	7	0	5502	Primary	103-013
Stool	8	0	6856	Primary	103-013
Stool	1	0	5479	Primary	103-015
Stool	2	0	5503	Primary	103-015
Stool	5	0	5541	Primary	103-015
Stool	7	0	5998	Primary	103-015
Stool	8	0	6890	Primary	103-015
Stool	1	0	2803	Primary	106-006
Stool	5	0	3048	Primary	106-006
Stool	7	0	3084	Primary	106-006
"""

# Parse the embedded tab-separated metadata into a DataFrame.
metadata_table = CSV.read(
    IOBuffer(data_str),
    DataFrames.DataFrame;
    delim = '\t',
    ignorerepeated = true,
)

# The LIMS ID is the integer after the last underscore of BioSampleName.
sample_to_barcode_table[!, "LIMS ID"] = [
    parse(Int, last(split(name, "_")))
    for name in sample_to_barcode_table[!, "BioSampleName"]
]

# Keep only the samples that appear in both tables.
metadata_table = DataFrames.innerjoin(metadata_table, sample_to_barcode_table, on="LIMS ID")

# visit time is all 0
# , "T" * string(r["Time (hr)"])

In [None]:
# Display label ordered patient-first, e.g. "<Patient ID> V<Visit>".
metadata_table[!, "patient_first_id"] = string.(metadata_table[!, "Patient ID"], " V", metadata_table[!, "Visit"])

In [None]:
# Display label ordered visit-first, e.g. "V<Visit> <Patient ID>".
metadata_table[!, "visit_first_id"] = string.("V", metadata_table[!, "Visit"], " ", metadata_table[!, "Patient ID"])

In [None]:
# Display the joined metadata table as the cell output for inspection.
metadata_table

In [None]:
# map to blast NT
# NOTE(review): this rebinds blast_db / blast_db_path from the dated
# "20240418.nt" snapshot used for mapping to plain "nt" — confirm that the
# taxonomy lookup below is meant to use a different database name.
blast_db = "nt"
blast_dbs_dir = joinpath(homedir(), "workspace", "blastdb")
# path_to_db = joinpath(homedir(), "workspace", "blastdb", blast_db)
blast_db_path = joinpath(blast_dbs_dir, blast_db)

# # path_to_db = Mycelia.download_blast_db(db=db, source="ncbi")
# # compressed_fasta_export = Mycelia.export_blast_db(path_to_db = path_to_db)

In [None]:
# Export the taxonomy table for the BLAST database, load it, and time the whole step.
@time blast_db_taxonomy_table = Mycelia.export_blast_db_taxonomy_table(path_to_db = blast_db_path) |> Mycelia.load_blast_db_taxonomy_table

In [None]:
# Taxonomic rank (a column of the lineage table) at which reads are summarized.
# taxa_level = "species"
taxa_level = "genus"

# alignment file => (taxon => relative abundance), in metadata_table row order.
file_to_taxa_relative_abundances = OrderedCollections.OrderedDict{String, Dict{Union{Missing, String}, Float64}}()
ProgressMeter.@showprogress for xam in metadata_table[!, "xam"]
    @time records = Mycelia.parse_xam_to_mapped_records_table(xam)
    # Keep only the rows flagged as primary.
    records = records[records[!, "isprimary"], :]
    # Reference sequence -> taxid.
    records = DataFrames.innerjoin(records, blast_db_taxonomy_table, on="reference" => "sequence_id")
    # taxid -> summarized lineage, resolved once per distinct taxid.
    taxids = sort(unique(records[!, "taxid"]))
    lineages = Mycelia.taxids2taxonkit_summarized_lineage_table(taxids)
    records = DataFrames.innerjoin(records, lineages, on="taxid")
    taxa_counts = StatsBase.countmap(records[!, taxa_level])
    file_to_taxa_relative_abundances[xam] = Mycelia.normalize_countmap(taxa_counts)
end
file_to_taxa_relative_abundances

In [None]:
# Build a taxa x sample matrix of relative abundances (rows follow
# unique_sorted_taxa, columns follow file_to_taxa_relative_abundances order).
unique_sorted_taxa = unique(sort(collect(reduce(union, keys.(values(file_to_taxa_relative_abundances))))))
n_samples = length(file_to_taxa_relative_abundances)
abundance_matrix = zeros(length(unique_sorted_taxa), n_samples)
taxa_names_to_indices = Dict(t => i for (i, t) in enumerate(unique_sorted_taxa))
for (column, (file, abundances)) in enumerate(file_to_taxa_relative_abundances)
    # @show column, sample
    for (taxa, relative_abundance) in abundances
        row = taxa_names_to_indices[taxa]
        abundance_matrix[row, column] = relative_abundance
    end
end

file_to_identifier = Dict(row["xam"] => row["patient_first_id"] for row in DataFrames.eachrow(metadata_table))
# file_to_identifier = Dict(row["xam"] => row["visit_first_id"] for row in DataFrames.eachrow(metadata_table))

file_labels = [file_to_identifier[k] for k in keys(file_to_taxa_relative_abundances)]

# Reorder the columns so that samples appear in label order.
abundance_matrix = abundance_matrix[:, sortperm(file_labels)]
file_labels = sort(file_labels)

# drop human and missing
filtered_indices = findall(x -> !(ismissing(x) || x == "Homo"), unique_sorted_taxa)
unique_sorted_taxa = unique_sorted_taxa[filtered_indices]
abundance_matrix = abundance_matrix[filtered_indices, :]

# Calculate the sum of each col
col_sums = sum(abundance_matrix, dims=1)
# Normalize each element by dividing by the col sum
# NOTE(review): a sample whose remaining taxa sum to zero becomes a NaN column here.
abundance_matrix = abundance_matrix ./ col_sums

# # vaginal_indices
# indices = [1, 2, 5, 6]
# label = "vaginal"
# top_N = 6

# perianal_indices
# indices = [3, 4, 7, 8]
# label = "perianal"
top_N = 50

# file_labels_subset = file_labels[indices]
# abundance_matrix_subset = abundance_matrix[:, indices]

# Order taxa by total abundance, ascending, so the most abundant end up last.
sort_perm = sortperm(vec(sum(abundance_matrix, dims=2)))
unique_sorted_taxa_subset = unique_sorted_taxa[sort_perm]
abundance_matrix_subset = abundance_matrix[sort_perm, :]
non_zero_indices = findall(vec(sum(abundance_matrix_subset, dims=2)) .> 0.0)
unique_sorted_taxa_subset = unique_sorted_taxa_subset[non_zero_indices]
abundance_matrix_subset = abundance_matrix_subset[non_zero_indices, :]
colorscheme = Colors.distinguishable_colors(length(unique_sorted_taxa_subset), [Colors.RGB(1,1,1), Colors.RGB(0,0,0)], dropseed=true)

# Never ask for more taxa than are available: the `end-(top_N-1):end` slices
# below would otherwise start at a non-positive index and throw a BoundsError.
top_N = min(top_N, length(unique_sorted_taxa_subset))

scaler = 800

# Stacked bars: one bar per sample, one series per taxon (the top_N most abundant).
StatsPlots.groupedbar(
    abundance_matrix_subset'[:, end-(top_N-1):end],
    bar_position = :stack,
    bar_width=0.7, 
    label = hcat(unique_sorted_taxa_subset...)[:, end-(top_N-1):end], 
    xticks = (1:length(file_labels), file_labels), 
    xrotation = 45,
    ylabel = "proportion of reads",
    # xlabel = "$(label) sample",
    # title = "$(taxa_level) relative abundance (top $(top_N-2))",
    title = "$(taxa_level) relative abundance (top $(top_N) classified and non-human)",
    legend = :outertopright,
    # legend = false,
    size = (2*scaler, scaler),
    margins = 15StatsPlots.Plots.PlotMeasures.mm,
    seriescolor = hcat(reverse(colorscheme)...)[:, end-(top_N-1):end]
)