In [None]:
import Pkg

# Third-party and stdlib packages this notebook relies on. Each one is brought in with
# `import` (not `using`), so every call below is written with its module prefix.
pkgs = [
    "DataFrames",
    "StatsBase",
    "StatsPlots",
    "uCSV",
    "ProgressMeter",
    "Distances",
    "Clustering",
    "Colors",
    "MultivariateStats",
    "Dates",
    "CategoricalArrays",
    "GLM",
    "Statistics",
]
# Pkg.add(pkgs)
for pkg in pkgs
    # build the `import <pkg>` expression directly from the package name
    @eval import $(Symbol(pkg))
end

import Mycelia

In [None]:
# The project root is the parent of the current working directory; input data and
# results live in sibling folders underneath it.
base_dir = first(splitdir(pwd()))
data_directory = joinpath(base_dir, "data")
# `mkpath` creates the folder if needed and returns its path
results_dir = mkpath(joinpath(base_dir, "results"))

In [None]:
# full paths of every entry found directly under <data_directory>/samples
sample_directories = readdir(joinpath(data_directory, "samples"), join=true)

In [None]:
# list the directories (full paths) found under ~/workspace/kraken
kraken_dbs = filter(isdir, readdir("$(homedir())/workspace/kraken", join=true))

In [None]:
# Name of the Kraken database whose reports are analysed; it is matched as a substring
# of each report path when the report list is filtered.
# kraken_db = "/home/cjprybol/workspace/kraken/k2_pluspfp_20231009"
kraken_db = "k2_pluspfp_20231009"

In [None]:
# Walk <sample>/<…trimgalore>/<…kraken…>/ and gather every file whose name ends in
# "kraken-report.tsv", then keep only the reports whose path mentions `kraken_db`.
kraken_reports = String[]
for sample_directory in sample_directories
    for trim_galore_directory in readdir(sample_directory, join=true)
        occursin(r"trimgalore$", trim_galore_directory) || continue
        for kraken_directory in readdir(trim_galore_directory, join=true)
            occursin(r"kraken", kraken_directory) || continue
            for report in readdir(kraken_directory, join=true)
                if occursin(r"kraken-report\.tsv$", report)
                    push!(kraken_reports, report)
                end
            end
        end
    end
end
kraken_reports = filter(report -> occursin(kraken_db, report), kraken_reports)

In [None]:
# taxonomic ranks as reported by Mycelia; one of them is picked by position further down
taxon_levels = Mycelia.list_ranks()

In [None]:
# Position (1-based) of the taxonomic rank to analyse within `taxon_levels`.
# Earlier runs stepped through i = 1, 2, …, 8; edit this value to switch rank.
i = 9

(taxon_index, taxon_level) = (i, collect(taxon_levels)[i])
println("$(taxon_index) - $(taxon_level)")
# table of the taxa that belong to the selected rank
rank_table = Mycelia.list_rank(taxon_level)

In [None]:
# Stack, across all Kraken reports, the report rows whose taxon id belongs to the
# selected rank, tagging each row with the report file it came from.
# TODO: turn this into a function that accepts a rank table and a list of kraken
# reports as well as an output.
rank_taxids = Set(rank_table[!, "taxid"])
cross_sample_taxon_report_table = DataFrames.DataFrame()
ProgressMeter.@showprogress for kraken_report in kraken_reports
    report_table = Mycelia.read_kraken_report(kraken_report)
    is_at_rank = [taxid in rank_taxids for taxid in report_table[!, "ncbi_taxonid"]]
    taxon_level_report = report_table[is_at_rank, :]
    # the report's file name doubles as the sample identifier
    taxon_level_report[!, "sample_identifier"] .= basename(kraken_report)
    append!(cross_sample_taxon_report_table, taxon_level_report)
end
cross_sample_taxon_report_table

In [None]:
# drop human reads (rows whose NCBI taxon id is 9606)
is_not_human = cross_sample_taxon_report_table[!, "ncbi_taxonid"] .!= 9606
filtered_cross_sample_taxon_report = cross_sample_taxon_report_table[is_not_human, :]

# Build the slimmed-down summary table.
# NOTE: select with `[:, cols]` (copies the columns) rather than `[!, cols]` (shares the
# column vectors). The in-place `sort!`/`unique!` at the end of this cell would otherwise
# reorder/delete entries of vectors that `filtered_cross_sample_taxon_report` still
# holds, leaving that table with misaligned rows.
cross_sample_taxon_report_summary = filtered_cross_sample_taxon_report[:,
    DataFrames.Not([
        "percentage_of_fragments_at_or_below_taxon",
        "number_of_fragments_assigned_directly_to_taxon",
        "rank",
    ])]

# one label per taxon: "<ncbi_taxonid>_<scientific_name>"
cross_sample_taxon_report_summary[!, "taxon"] = map(
    row -> string(row["ncbi_taxonid"]) * "_" * row["scientific_name"],
    DataFrames.eachrow(cross_sample_taxon_report_summary),
)
DataFrames.select!(
    cross_sample_taxon_report_summary,
    DataFrames.Not(["ncbi_taxonid", "scientific_name"]),
)

# participant = the first two '_'-separated fields of the sample identifier
cross_sample_taxon_report_summary[!, "participant"] = map(
    x -> join(split(x, '_')[1:2], "_"),
    cross_sample_taxon_report_summary[!, "sample_identifier"],
)

# assert sortedness & uniqueness (should be a no-op)
unique!(DataFrames.sort!(cross_sample_taxon_report_summary, ["sample_identifier", "taxon"]))


In [None]:
# KEEP FILTERS
# cross_sample_taxon_report_summary = cross_sample_taxon_report_summary[map(x -> occursin(r"^LO_", string(x)), cross_sample_taxon_report_summary[!, "participant"]), :]
# cross_sample_taxon_report_summary = cross_sample_taxon_report_summary[map(x -> occursin(r"^RI_", string(x)), cross_sample_taxon_report_summary[!, "participant"]), :]

# EXCLUDE FILTERS
# A row is dropped when any of the following holds:
#   - its participant starts with RA_PROB (remove all of them)
#   - its sample identifier contains the second RI_3BRO L2 run (the first one is kept)
#   - its participant starts with RI_MGF
# filter(x -> occursin(r"\^RI_3BRO", x), sort(unique(cross_sample_taxon_report_summary[!, "sample_identifier"])))
cross_sample_taxon_report_summary = let table = cross_sample_taxon_report_summary
    participants = string.(table[!, "participant"])
    identifiers = string.(table[!, "sample_identifier"])
    keep = map(participants, identifiers) do participant, identifier
        !(
            occursin(r"^RA_PROB", participant) ||
            occursin("RI_3BRO_CKDN230005692-1A_H5HYWDSX7_L2", identifier) ||
            occursin(r"^RI_MGF", participant)
        )
    end
    table[keep, :]
end

In [None]:
# cross_sample_taxon_report_summary_summary = DataFrames.combine(
#     DataFrames.groupby(cross_sample_taxon_report_summary[!, DataFrames.Not("sample_identifier")], ["participant", "taxon"]),
#     "number_of_fragments_at_or_below_taxon" => sum)

# cross_sample_taxon_report_summary_summary[!, "participant"] = CategoricalArrays.categorical(cross_sample_taxon_report_summary_summary[!, "participant"])

# cross_sample_taxon_report_summary_summary[!, "taxon"] = CategoricalArrays.categorical(cross_sample_taxon_report_summary_summary[!, "taxon"])

# cross_sample_taxon_report_summary_summary

# always_missing_taxa = Set([])
# for g in DataFrames.groupby(cross_sample_taxon_report_summary_summary, "taxon")
#     if sum(g[!, "number_of_fragments_at_or_below_taxon_sum"]) == 0
#         push!(always_missing_taxa, g[1, "taxon"])
#     end
# end
# always_missing_taxa

# cross_sample_taxon_report_summary_summary = cross_sample_taxon_report_summary_summary[map(x -> !(x in always_missing_taxa), cross_sample_taxon_report_summary_summary[!, "taxon"]), :]

In [None]:
# cross_sample_taxon_report_summary_summary = cross_sample_taxon_report_summary_summary[map(x -> occursin(r"^LO_", string(x)), cross_sample_taxon_report_summary_summary[!, "participant"]), :]
# cross_sample_taxon_report_summary_summary = cross_sample_taxon_report_summary_summary[map(x -> occursin(r"^RI_", string(x)), cross_sample_taxon_report_summary_summary[!, "participant"]), :]

In [None]:
# sample_mask = map(x -> !occursin(r"^RA_PROB", x) && !occursin(r"^RI_MGF", x) && !occursin(r"^RI_3BRO_CKDN230005692-1A_H5HYWDSX7_L2", x), samples)
# filtered_values = values[sample_mask, :]
# filtered_samples = samples[sample_mask]
# normalized_values = filtered_values ./ sum(filtered_values, dims=2)

In [None]:
# model = GLM.lm(GLM.@formula(number_of_fragments_at_or_below_taxon_sum ~ participant + taxon), cross_sample_taxon_report_summary_summary)
# coeftable = GLM.coeftable(model)

In [None]:
# coeftable[!, coeftable.pvalcol]

In [None]:
# model = GLM.lm(GLM.@formula(number_of_fragments_at_or_below_taxon_sum ~ participant + taxon), cross_sample_taxon_report_summary_summary)
# coeftable = GLM.coeftable(model)

In [None]:
# pvalues = coeftable.cols[coeftable.pvalcol]
# pvalue_ordering = sortperm(pvalues)
# ordered_features = coeftable.rownms[pvalue_ordering]
# ordered_pvalues = pvalues[pvalue_ordering]
# ordered_pvalues .*= length(ordered_pvalues)

In [None]:
# is_signficant = ordered_pvalues .<= 0.001

In [None]:
# ordered_features = ordered_features[is_signficant]
# ordered_pvalues = ordered_pvalues[is_signficant]

In [None]:
# uCSV.write(
#     joinpath(results_dir, "significance_table.tsv"),
#     DataFrames.DataFrame(
#     feature = ordered_features,
#     pvalue = ordered_pvalues),
#     delim='\t')

In [None]:
# model

In [None]:
# filtered_cross_sample_taxon_report

In [None]:
# taxa = String[]
# samples = String[]
# n_samples = length(unique(cross_sample_taxon_report_summary[!, "sample_identifier"]))
# n_taxa = length(unique(cross_sample_taxon_report_summary[!, "taxon"]))
# values = zeros(n_samples, n_taxa)

# Build the sample × taxon count matrix: rows follow `samples`, columns follow `taxa`
# (both sorted). Each table row is placed by looking up its sample and taxon, so the
# result does not depend on the row order of the table or on every sample listing every
# taxon; (sample, taxon) pairs that never occur simply stay at 0.
taxa = sort(unique(cross_sample_taxon_report_summary[!, "taxon"]))
samples = sort(unique(cross_sample_taxon_report_summary[!, "sample_identifier"]))
taxon_to_column = Dict(taxon => column for (column, taxon) in enumerate(taxa))
sample_to_row = Dict(sample => row for (row, sample) in enumerate(samples))
values = zeros(length(samples), length(taxa))
# tracks which cells were already written, to catch duplicated (sample, taxon) rows
is_filled = falses(length(samples), length(taxa))
ProgressMeter.@showprogress for record in DataFrames.eachrow(cross_sample_taxon_report_summary)
    sample = record["sample_identifier"]
    taxon = record["taxon"]
    row_index = sample_to_row[sample]
    column_index = taxon_to_column[taxon]
    if is_filled[row_index, column_index]
        error("expected exactly one row per sample and taxon, found several for sample $(sample) and taxon $(taxon)")
    end
    is_filled[row_index, column_index] = true
    values[row_index, column_index] = record["number_of_fragments_at_or_below_taxon"]
end
values

In [None]:
# Order taxa by their largest count in any single sample, in ascending order
# (the taxon with the largest single-sample count ends up LAST).
taxa_frequency_ordering = sortperm(maximum.(eachcol(values)))
values = values[:, taxa_frequency_ordering]
taxa = taxa[taxa_frequency_ordering]
# find taxa that have no representation, and filter them out
taxa_is_detected = [sum(col) > 0 for col in eachcol(values)]
values = values[:, taxa_is_detected]
taxa = taxa[taxa_is_detected]
# keep only the part of each sample identifier that precedes the first '.'
samples = string.(first.(split.(samples, '.')))
# Convert counts to per-sample fractions (each row sums to 1). A sample whose counts are
# all zero would yield a NaN row (0/0) and propagate NaNs into the PCA, so such rows are
# left as all zeros and reported instead.
sample_totals = sum(values, dims=2)
empty_samples = samples[vec(sample_totals .== 0)]
if !isempty(empty_samples)
    @warn "samples without any counted fragments; their normalized rows are all zero" empty_samples
end
normalized_values = values ./ ifelse.(sample_totals .== 0, 1.0, sample_totals)

In [None]:
# for x in reverse(taxa)[1:10]
#     println(x)
# end

In [None]:
# for (sample, row) in zip(samples, eachrow(values))
#     @show sample, taxa[last(findmax(row))]
# end

In [None]:
# normalized_values is samples × taxa; transposing it puts one sample in each column
# before fitting the PCA model
fit_pca = MultivariateStats.fit(MultivariateStats.PCA, normalized_values')

In [None]:
?MultivariateStats.transform

In [None]:
# project the same (transposed) matrix with the fitted model; the cells below read
# row 1/2/3 of the result as the PC1/PC2/PC3 coordinates, one column per sample
transformed_observations = MultivariateStats.transform(fit_pca, normalized_values')

In [None]:
# HSD vs not

In [None]:
# Plot groups, in the order used to index xs/ys/zs: 1 = "HSD", 2 = "non-HSD".
# top_level_groups = unique(map(x -> join(split(x, '_')[1:2], '_'), samples))
top_level_groups = ["HSD", "non-HSD"]

In [None]:
# Participant tags of the "HSD" group; a sample whose identifier contains any of these
# strings is plotted as HSD, every other sample as non-HSD.
HSD = [
    "RI_3BRO",
    "RI_PROB",
    "LO_PROB",
    "VA_PROB"
]

# colorscheme = Colors.distinguishable_colors(length(top_level_groups), [Colors.RGB(1,1,1), Colors.RGB(0,0,0)], dropseed=true)

In [None]:
# colorscheme = Colors.distinguishable_colors(length(top_level_groups), [Colors.RGB(1,1,1), Colors.RGB(0,0,0)], dropseed=true)
# one (initially empty) coordinate buffer per plot group
xs = [Float64[] for _ in top_level_groups]
ys = [Float64[] for _ in top_level_groups]
zs = [Float64[] for _ in top_level_groups]
# Row k of the transformed matrix is read as the k-th principal component; when fewer
# than 2 (or 3) rows are present, the missing axis is filled with zeros.
raw_xs = transformed_observations[1, :]
raw_ys = size(transformed_observations, 1) >= 2 ? transformed_observations[2, :] : zeros(length(raw_xs))
raw_zs = size(transformed_observations, 1) >= 3 ? transformed_observations[3, :] : zeros(length(raw_xs))


In [None]:
# Distribute each sample's coordinates into its group's buffers:
# group 1 ("HSD") when the sample identifier contains any HSD tag, otherwise group 2.
for (sample, x, y, z) in zip(samples, raw_xs, raw_ys, raw_zs)
    sample_group = any(prefix -> occursin(prefix, sample), HSD) ? 1 : 2
    push!(xs[sample_group], x)
    push!(ys[sample_group], y)
    push!(zs[sample_group], z)
end

In [None]:
# 2-D scatter of the first two principal components, one series per group.
# (A 3-D variant would additionally pass `zs` and `zlabel = "PC3"`.)
plot = StatsPlots.scatter(
    xs,
    ys;
    title = "participant clustering after removing human reads and outlier samples",
    xlabel = "PC1",
    ylabel = "PC2",
    # series labels must be a 1×n row, one entry per group
    labels = permutedims(top_level_groups),
    legend = :outertopright,
    margins = 20 * StatsPlots.px,
    # seriescolor = hcat(colorscheme...),
    size = (900, 600),
    dpi = 300,
)

# display(plot)
# for extension in [".png"]
#     file = joinpath(results_dir, "taxonomic-breakdowns.kraken.$(kraken_db).$(taxon_index).$(taxon_level).outliers-removed.pca") * extension
#     StatsPlots.savefig(plot, file)
# end

In [None]:
# Taxa ranked by their loading on the first principal component, largest (most
# positive) weight first.
println("Top 10 features in principal component 1")
pc1_table = DataFrames.DataFrame(
    taxa = taxa,
    weight = MultivariateStats.loadings(fit_pca)[:, 1]
)
sort!(pc1_table, "weight", rev=true)
# `first` returns at most 10 rows, so this also works when fewer than 10 taxa remain
first(pc1_table, 10)

In [None]:
# Taxa ranked by their loading on the second principal component, largest (most
# positive) weight first.
println("Top 10 features in principal component 2")
pc2_table = DataFrames.DataFrame(
    taxa = taxa,
    weight = MultivariateStats.loadings(fit_pca)[:, 2]
)
sort!(pc2_table, "weight", rev=true)
# `first` returns at most 10 rows, so this also works when fewer than 10 taxa remain
first(pc2_table, 10)

In [None]:
# From the MultivariateStats docs: returns the projection matrix (of size (d, p)).
# Each column of the projection matrix corresponds to a principal component, and the
# principal components are arranged in descending order of the corresponding variances.
MultivariateStats.projection(fit_pca)

In [None]:
# From the MultivariateStats docs: returns the ratio of variance preserved in the
# principal subspace, which is equal to tprincipalvar(M) / var(M) (here M is fit_pca).
MultivariateStats.principalratio(fit_pca)