In [None]:
# Environment bootstrap for this notebook: activate a throwaway environment, load
# Revise, dev-install the local Mycelia checkout, then add and import every package
# the cells below reference by qualified name.
#
# If you hit plotting library issues, try resetting the LD path for Julia; it can be
# set in the kernel spec under ~/.local/share/jupyter/kernels/.
# Fail fast when a non-empty LD_LIBRARY_PATH is inherited from the kernel environment.
@assert isempty(get(ENV, "LD_LIBRARY_PATH", "")) "LD_LIBRARY_PATH must be unset or empty; reset it in the Jupyter kernel spec"
import Pkg
Pkg.activate(; temp=true)
Pkg.add("Revise")
import Revise

# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# StatsPlots and MultivariateStats are used by the plotting / PCA cells further down,
# so they must be installed and imported here alongside the other dependencies.
pkgs = String[
    "DataFrames",
    "FASTX",
    "XAM",
    "uCSV",
    "CodecZlib",
    "ProgressMeter",
    "StatsBase",
    "StatsPlots",
    "MultivariateStats"
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the `import <pkg>` expression directly instead of parsing a string.
    eval(Expr(:import, Expr(:., Symbol(pkg))))
end

In [None]:
# rclone copy --progress --verbose --drive-chunk-size 2G --drive-upload-cutoff 1T --tpslimit 1 locus_data_warehouse:Genomics/SequelIIe/r64342e_20240416_185917/1_A01/bc2056--bc2056/m64342e_240416_193427.hifi_reads.bc2056--bc2056.bam /global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data
# /global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data
# tarchive = "/global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data/locus-reference-genomes.tar.gz"
# tarchive = "/global/homes/c/cjprybol/workspace/Mycelia/projects/metagenome/data/SRR9202034.tar.gz"
# Mycelia.tar_extract(tarchive=tarchive)

In [None]:
# Locate the BLAST database under ~/workspace/blastdb, export its taxonomy table and
# load the exported table. The timing covers both the export and the load.
blast_db = "nt_prok"
blast_dbs_dir = joinpath(homedir(), "workspace", "blastdb")
blast_db_path = joinpath(blast_dbs_dir, blast_db)
@time blast_db_taxonomy_table = Mycelia.export_blast_db_taxonomy_table(path_to_db = blast_db_path) |> Mycelia.load_blast_db_taxonomy_table

In [None]:
# Resolve the sample directory relative to the parent of the current working directory.
basedir = dirname(pwd())
data_dir = joinpath(basedir, "data")
sample_dir = joinpath(data_dir, "SRR9202034")

# Collect the gzipped SAM files, smallest first. Match on the file-name suffix so
# that siblings such as "*.sam.gz.tmp" or "*.sam.gz.md5" are not picked up.
mapped_reads = sort(
    filter(path -> endswith(path, ".sam.gz"), readdir(sample_dir, join=true)),
    by=filesize,
)
foreach(println, mapped_reads)

In [None]:
# Cache: mapped-reads file path => taxids of its primary mapped alignments.
# Re-running this cell resets the cache.
file_to_counts = Dict{String,Vector{Int}}()

In [None]:
# For (at most) the three smallest alignment files, parse the records, attach taxids
# by joining the alignment reference against the BLAST taxonomy table, and cache the
# taxids of the primary mapped alignments. Files already in the cache are skipped, so
# the cell can be re-run cheaply.
# `1:min(3, end)` avoids a BoundsError when fewer than three files are present.
ProgressMeter.@showprogress for f in mapped_reads[1:min(3, end)]
    if !haskey(file_to_counts, f)
        # The second return value (the header) is not needed here.
        records, _ = Mycelia.parse_xam(f)
        records_table = Mycelia.xam_records_to_dataframe(records)
        records_table_with_taxids = DataFrames.innerjoin(records_table, blast_db_taxonomy_table, on="reference" => "sequence_id")
        is_primary_mapping = records_table_with_taxids[!, "ismapped"] .& records_table_with_taxids[!, "isprimary"]
        file_to_counts[f] = records_table_with_taxids[is_primary_mapping, "taxid"]
    end
end

In [None]:
# Number of primary mapped alignments cached per file.
[length(taxids) for taxids in values(file_to_counts)]

In [None]:
# Reference composition: species name => relative abundance.
# NOTE(review): the values sum to 100, so they look like percentages — confirm.
species_relative_abundances = Pair{String, Float64}[
    "Acinetobacter baumannii"      => 0.18,
    "Bacillus pacificus"           => 1.80,
    "Phocaeicola vulgatus"         => 0.02,
    "Bifidobacterium adolescentis" => 0.02,
    "Clostridium beijerinckii"     => 1.80,
    "Cutibacterium acnes"          => 0.18,
    "Deinococcus radiodurans"      => 0.02,
    "Enterococcus faecalis"        => 0.02,
    "Escherichia coli"             => 18.0,
    "Helicobacter pylori"          => 0.18,
    "Lactobacillus gasseri"        => 0.18,
    "Neisseria meningitidis"       => 0.18,
    "Porphyromonas gingivalis"     => 18.0,
    "Pseudomonas paraeruginosa"    => 1.80,
    "Cereibacter sphaeroides"      => 18.0,
    "Schaalia odontolytica"        => 0.02,
    "Staphylococcus aureus"        => 1.80,
    "Staphylococcus epidermidis"   => 18.0,
    "Streptococcus agalactiae"     => 1.80,
    "Streptococcus mutans"         => 18.0,
]

In [None]:
# Look up taxids for the reference species names.
names2taxid_table = Mycelia.names2taxids(map(first, species_relative_abundances))

In [None]:
# Load the taxonomy tables only when they are not already defined in Main, so that
# re-running this cell does not repeat the calls.
isdefined(Main, :taxonomy_table) || (taxonomy_table = Mycelia.list_full_taxonomy())
isdefined(Main, :species_table) || (species_table = Mycelia.list_species())
taxids = Set(species_table[!, "taxid"])

# Keep only the taxonomy rows whose taxid is a species taxid.
# 5s
@time species_taxonomy_table = taxonomy_table[in.(taxonomy_table[!, "taxid"], Ref(taxids)), :]
# Drop rows with an empty superkingdom taxid.
species_taxonomy_table = species_taxonomy_table[(!isempty).(species_taxonomy_table[!, "superkingdom_taxid"]), :]

In [None]:
# names2taxids_dict = Dict(row["name"] => row["taxid"] for row in DataFrames.eachrow(names2taxid_table))

In [None]:
# Per file: taxid => number of primary mapped alignments assigned to that taxid.
file_to_taxhits = Dict(path => StatsBase.countmap(taxid_hits) for (path, taxid_hits) in pairs(file_to_counts))

In [None]:
# use taxonomy table to link whatever the final taxa id (subspecies or strain usually) to species taxid
# Collect every taxid that received at least one hit in any file. This is derived from
# `file_to_taxhits` directly so the cell does not depend on variables that are only
# defined in later cells.
observed_taxids = Set(taxid for taxhits in values(file_to_taxhits) for taxid in keys(taxhits))
# use taxonomy table to link whatever the final taxa id (subspecies or strain usually) to species taxid
taxonomy_table[in.(taxonomy_table[!, "taxid"], Ref(observed_taxids)), :]

In [None]:
# Build the species-name => taxid lookup from the names table here, since nothing else
# in the notebook defines it.
# NOTE(review): assumes `names2taxid_table` has "name" and "taxid" columns — confirm.
names2taxids_dict = Dict(row["name"] => row["taxid"] for row in DataFrames.eachrow(names2taxid_table))
# Reference composition re-keyed by taxid. Throws KeyError if a species name has no
# entry in the lookup.
taxid_to_relative_abundances_reference = Dict(names2taxids_dict[name] => abundance for (name, abundance) in species_relative_abundances)

In [None]:

# Convert the per-file taxid hit counts (`file_to_taxhits`) into per-file relative
# abundances: each taxid's hits divided by the file's total hits. Entries are keyed by
# file basename so they can be looked up by sample file name.
file_to_tax_relative_abundances = Dict{String, Dict{Int, Float64}}()
for (f, tax_counts) in file_to_taxhits
    total_count = sum(values(tax_counts))
    file_to_tax_relative_abundances[basename(f)] = Dict(tax => n / total_count for (tax, n) in tax_counts)
end
file_to_tax_relative_abundances

In [None]:
# Sorted list of every distinct taxid seen in any sample.
sorted_observed_taxids = sort(mapreduce(collect ∘ keys, union, values(file_to_tax_relative_abundances)))

In [None]:
# taxid => its position in `sorted_observed_taxids`.
taxid_to_sorted_index = Dict(zip(sorted_observed_taxids, eachindex(sorted_observed_taxids)))

In [None]:
# Sample file names in plotting order. They differ only in the value after "P";
# the values are kept as strings so the names are reproduced exactly.
ordered_samples = [
    "SRR9202034.seqkit.P$(proportion).fq.gz.nt_prok.fna.gz.minimap2.sam.gz"
    for proportion in ("0.0001", "0.001", "0.01")
]

In [None]:
# One taxid => abundance dictionary per column: the reference first, then each sample
# in `ordered_samples` order.
taxids_to_relative_abundances = vcat(
    [taxid_to_relative_abundances_reference],
    map(sample -> file_to_tax_relative_abundances[sample], ordered_samples),
)

In [None]:
# Column labels: the reference followed by the sample file names.
labels = ["reference"; ordered_samples]

In [None]:
# Set form of the observed taxids for membership tests.
sorted_observed_taxids_set = Set{eltype(sorted_observed_taxids)}(sorted_observed_taxids)

In [None]:
# Fill a (taxid x column) matrix: rows follow `sorted_observed_taxids`, columns follow
# `taxids_to_relative_abundances` (same order as `labels`). Cells with no entry stay 0.
#
# The row index only covers taxids observed in the samples, so a column may contain
# taxids without a row (e.g. the reference). Those are skipped with a warning instead
# of aborting the whole cell with a KeyError.
counts_matrix = zeros(length(sorted_observed_taxids), length(labels))
for (column, sample) in enumerate(taxids_to_relative_abundances)
    # @show column, sample
    skipped_taxids = 0
    for (taxid, abundance) in sample
        row = get(taxid_to_sorted_index, taxid, nothing)
        if row === nothing
            skipped_taxids += 1
            continue
        end
        counts_matrix[row, column] = abundance
    end
    if skipped_taxids > 0
        @warn "taxids without a row in counts_matrix were skipped" column skipped_taxids
    end
end
counts_matrix

In [None]:
# One x tick per column of `counts_matrix`; the columns were built in `labels` order.
column_names = labels

# One series per row of `counts_matrix`; the rows follow `sorted_observed_taxids`, so
# the legend entries are those taxids (not the reference species name table, which has
# a different length and order).
row_names = string.(sorted_observed_taxids)

# Create the stacked barplot
# NOTE(review): `counts_matrix` is filled with relative abundances, while the axis
# label and title below talk about read counts — confirm the intended wording.
StatsPlots.groupedbar(
    counts_matrix',
    bar_position = :stack,
    bar_width=0.7, 
    label = permutedims(row_names), 
    xticks = (1:size(counts_matrix, 2), column_names), 
    xrotation = 45,
    ylabel = "# of reads", 
    xlabel = "Sample", 
    title = "Species absolute abundance plot",
    legend = :outertopright,
    size = (1000, 500),
    margins = 10StatsPlots.Plots.PlotMeasures.mm
)

In [None]:
# Convert the matrix with Mycelia's count-to-probability helper, then draw the same
# stacked bar chart as above from the result.
relative_abundance_matrix = Mycelia.count_matrix_to_probability_matrix(counts_matrix)

StatsPlots.groupedbar(
    adjoint(relative_abundance_matrix);
    title = "Species relative abundance plot",
    xlabel = "Sample",
    ylabel = "proportion of reads",
    label = permutedims(row_names),
    xticks = (1:size(counts_matrix, 2), column_names),
    xrotation = 45,
    bar_position = :stack,
    bar_width = 0.7,
    legend = :outertopright,
    size = (1000, 500),
    margins = 10 * StatsPlots.Plots.PlotMeasures.mm,
)

In [None]:
# Cosine distance matrix computed by Mycelia from the relative abundance matrix.
cosine_distance_matrix = relative_abundance_matrix |> Mycelia.frequency_matrix_to_cosine_distance_matrix

In [None]:
# Fit a PCA on the cosine distance matrix and project the matrix onto it.
fit_pca = MultivariateStats.fit(MultivariateStats.PCA, cosine_distance_matrix)
transformed_observations = MultivariateStats.transform(fit_pca, cosine_distance_matrix')

# One single-point series per projected observation, so each gets its own legend entry.
# Iterate over the actual columns instead of a hard-coded count, which throws a
# BoundsError whenever the number of observations is not exactly nine.
observation_indices = axes(transformed_observations, 2)
xs = [[transformed_observations[1, i]] for i in observation_indices]
ys = [[transformed_observations[2, i]] for i in observation_indices]

plot = 
StatsPlots.scatter(
    xs,
    ys,
    xlabel = "PC1",
    ylabel = "PC2",
    labels = permutedims(column_names),
    title = "PCA of relative abundance data",
    legend = :outertopright,
    margins = 20StatsPlots.px,
    dpi=300,
    size=(600, 400)
)