# Mycelia Test Development Notebook
This notebook is for interactive development, debugging, and validation of test blocks from the Mycelia package.  
Copy and paste test blocks between this notebook and the corresponding test files (e.g., `test/1_data_acquisition/ncbi_download.jl`) to iterate quickly.  
Once tests are working as expected, move them back to the `.jl` files for automated testing.

In [None]:
# Activate the project environment located at the current working directory (".").
import Pkg
Pkg.activate(".")

In [None]:
using Revise
using Test

In [None]:
import Mycelia

In [None]:
import Mycelia
import Random
import Distributions
import MultivariateStats
import UMAP
import Statistics
import LinearAlgebra
import Plots
import Clustering

In [None]:
# Simulate a count matrix (one row per feature, one column per sample) drawn from
# several Negative Binomial parameterizations, shuffle its columns, and check the
# summary returned by `Mycelia.sanity_check_matrix`.
@testset "Negative Binomial (overdispersed counts) Matrix Processing" begin
    # Set a random seed for reproducibility
    Random.seed!(42)

    # Parameters
    n_distributions = 7      # Number of distributions
    n_samples = 10      # Number of samples per distribution
    n_features = 100     # Length of each distribution (number of features)

    # Negative Binomial (overdispersed counts)
    nb_r = 5  # dispersion parameter
    # One vector of per-feature success probabilities for each distribution
    nb_ps = [rand(0.2:0.05:0.8, n_features) for _ in 1:n_distributions]
    # `p` is a vector, so the constructor itself has to be broadcast
    # (`NegativeBinomial.`); the un-dotted call has no method for a vector argument.
    # Each draw is a length-`n_features` vector that becomes one column.
    nb_samples = [
        hcat([rand.(Distributions.NegativeBinomial.(nb_r, p)) for _ in 1:n_samples]...)
        for p in nb_ps
    ]
    nb_matrix = hcat(nb_samples...)
    # Label every column with the index of the distribution that generated it
    nb_labels = repeat(1:n_distributions, inner=n_samples)
    # Apply one permutation to columns and labels so they stay aligned
    perm = Random.shuffle(1:length(nb_labels))
    shuffled_nb_matrix = nb_matrix[:, perm]
    shuffled_nb_labels = nb_labels[perm]

    @testset "Sanity Check - Negative Binomial Matrix" begin
        summary = Mycelia.sanity_check_matrix(shuffled_nb_matrix)
        @test summary[:n_features] == n_features
        @test summary[:n_samples] == n_samples * n_distributions
        @test summary[:is_integer] == true
        @test summary[:is_nonnegative] == true
        @test summary[:is_binary] == false
        @test summary[:is_strictly_positive] == false
        @test summary[:is_in_01] == false
        @test summary[:is_probability_vector] == false
        @test summary[:suggested_epca] == :negbin_pca_epca
        @test summary[:suggested_distance] == :bray_curtis_distance
    end
end

# Simulate a bounded count matrix (one row per feature, one column per sample) drawn
# from several Binomial parameterizations, shuffle its columns, and check the summary
# returned by `Mycelia.sanity_check_matrix`.
@testset "Binomial (counts in 0:ntrials) Matrix Processing" begin
    # Set a random seed for reproducibility
    Random.seed!(42)

    # Parameters
    n_distributions = 7      # Number of distributions
    n_samples = 10      # Number of samples per distribution
    n_features = 100     # Length of each distribution (number of features)

    # Binomial (counts in 0:ntrials)
    ntrials = 10
    # One vector of per-feature success probabilities for each distribution
    binom_ps = [rand(n_features) for _ in 1:n_distributions]
    # `p` is a vector, so the constructor itself has to be broadcast (`Binomial.`);
    # the un-dotted call has no method for a vector argument.
    # Each draw is a length-`n_features` vector that becomes one column.
    binom_samples = [
        hcat([rand.(Distributions.Binomial.(ntrials, p)) for _ in 1:n_samples]...)
        for p in binom_ps
    ]
    binom_matrix = hcat(binom_samples...)
    # Label every column with the index of the distribution that generated it
    binom_labels = repeat(1:n_distributions, inner=n_samples)
    # Apply one permutation to columns and labels so they stay aligned
    perm = Random.shuffle(1:length(binom_labels))
    shuffled_binom_matrix = binom_matrix[:, perm]
    shuffled_binom_labels = binom_labels[perm]

    @testset "Sanity Check - Binomial Matrix" begin
        summary = Mycelia.sanity_check_matrix(shuffled_binom_matrix)
        @test summary[:n_features] == n_features
        @test summary[:n_samples] == n_samples * n_distributions
        @test summary[:is_integer] == true
        @test summary[:is_nonnegative] == true
        @test summary[:is_binary] == false
        @test summary[:is_strictly_positive] == false
        @test summary[:is_in_01] == false
        @test summary[:is_probability_vector] == false
        # Binomial is not strictly binary, so suggested_epca should be poisson or negbin
        @test summary[:suggested_epca] == :poisson_pca_epca || summary[:suggested_epca] == :negbin_pca_epca
        @test summary[:suggested_distance] == :bray_curtis_distance
    end
end

@testset "Continuous Bernoulli (values in (0,1)) Matrix Processing" begin
    # Fixed seed so the simulated matrix is the same on every run
    Random.seed!(42)

    # Simulation dimensions
    n_distributions = 7   # distinct parameter sets
    n_samples = 10        # draws per parameter set
    n_features = 100      # entries per draw

    # Per-feature base probabilities, one vector per distribution
    contb_ps = map(_ -> rand(n_features), 1:n_distributions)

    # Beta draws stand in for (0,1)-valued data. Both shape parameters are the
    # base probabilities rescaled by 0.9 and shifted by 0.05; each draw is a column.
    contb_samples = map(contb_ps) do p
        shape_a = p * 0.9 .+ 0.05
        shape_b = (1 .- p) * 0.9 .+ 0.05
        reduce(hcat, [rand.(Distributions.Beta.(shape_a, shape_b)) for _ in 1:n_samples])
    end
    contb_matrix = reduce(hcat, contb_samples)

    # Column labels identify the generating distribution; shuffle columns and
    # labels with the same permutation
    contb_labels = repeat(1:n_distributions; inner=n_samples)
    perm = Random.shuffle(eachindex(contb_labels))
    shuffled_contb_matrix = contb_matrix[:, perm]
    shuffled_contb_labels = contb_labels[perm]

    @testset "Sanity Check - Continuous Bernoulli Matrix" begin
        report = Mycelia.sanity_check_matrix(shuffled_contb_matrix)
        # Expected value for each summary field, checked in order
        expected = (
            :n_features => n_features,
            :n_samples => n_samples * n_distributions,
            :is_integer => false,
            :is_nonnegative => true,
            :is_binary => false,
            :is_strictly_positive => true,
            :is_in_01 => true,
            :is_probability_vector => false,
            :suggested_epca => :contbernoulli_pca_epca,
            :suggested_distance => :cosine_distance,
        )
        for (field, value) in expected
            @test report[field] == value
        end
    end
end

@testset "Gamma (strictly positive) Matrix Processing" begin
    # Fixed seed so the simulated matrix is the same on every run
    Random.seed!(42)

    # Simulation dimensions
    n_distributions = 7   # distinct parameter sets
    n_samples = 10        # draws per parameter set
    n_features = 100      # entries per draw

    # Per-feature shape and scale parameters, one pair of vectors per distribution
    gamma_shapes = map(_ -> rand(1.0:0.5:5.0, n_features), 1:n_distributions)
    gamma_scales = map(_ -> rand(1.0:0.5:3.0, n_features), 1:n_distributions)

    # Every draw is a length-`n_features` vector that becomes one column
    gamma_samples = map(gamma_shapes, gamma_scales) do sh, sc
        reduce(hcat, [rand.(Distributions.Gamma.(sh, sc)) for _ in 1:n_samples])
    end
    gamma_matrix = reduce(hcat, gamma_samples)

    # Column labels identify the generating distribution; shuffle columns and
    # labels with the same permutation
    gamma_labels = repeat(1:n_distributions; inner=n_samples)
    perm = Random.shuffle(eachindex(gamma_labels))
    shuffled_gamma_matrix = gamma_matrix[:, perm]
    shuffled_gamma_labels = gamma_labels[perm]

    @testset "Sanity Check - Gamma Matrix" begin
        report = Mycelia.sanity_check_matrix(shuffled_gamma_matrix)
        # Expected value for each summary field, checked in order
        expected = (
            :n_features => n_features,
            :n_samples => n_samples * n_distributions,
            :is_integer => false,
            :is_nonnegative => true,
            :is_binary => false,
            :is_strictly_positive => true,
            :is_in_01 => false,
            :is_probability_vector => false,
            :suggested_epca => :gamma_pca_epca,
            :suggested_distance => :cosine_distance,
        )
        for (field, value) in expected
            @test report[field] == value
        end
    end
end

@testset "Gaussian (centered, real-valued) Matrix Processing" begin
    # Fixed seed so the simulated matrix is the same on every run
    Random.seed!(42)

    # Simulation dimensions
    n_distributions = 7   # distinct parameter sets
    n_samples = 10        # draws per parameter set
    n_features = 100      # entries per draw

    # Per-feature means and standard deviations, one pair of vectors per distribution
    gauss_means = map(_ -> randn(n_features), 1:n_distributions)
    gauss_stds = map(_ -> rand(0.5:0.1:2.0, n_features), 1:n_distributions)

    # Every draw is a length-`n_features` vector that becomes one column
    gauss_samples = map(gauss_means, gauss_stds) do μ, σ
        reduce(hcat, [rand.(Distributions.Normal.(μ, σ)) for _ in 1:n_samples])
    end
    gauss_matrix = reduce(hcat, gauss_samples)

    # Column labels identify the generating distribution; shuffle columns and
    # labels with the same permutation
    gauss_labels = repeat(1:n_distributions; inner=n_samples)
    perm = Random.shuffle(eachindex(gauss_labels))
    shuffled_gauss_matrix = gauss_matrix[:, perm]
    shuffled_gauss_labels = gauss_labels[perm]

    @testset "Sanity Check - Gaussian Matrix" begin
        report = Mycelia.sanity_check_matrix(shuffled_gauss_matrix)
        # Normal draws include negative values, hence the sign-related flags
        # are expected to be false
        expected = (
            :n_features => n_features,
            :n_samples => n_samples * n_distributions,
            :is_integer => false,
            :is_binary => false,
            :is_nonnegative => false,
            :is_strictly_positive => false,
            :is_in_01 => false,
            :is_probability_vector => false,
        )
        for (field, value) in expected
            @test report[field] == value
        end
        # Either of the two PCA suggestions is accepted
        @test report[:suggested_epca] in (:gaussian_pca_epca, :pca_transform)
        @test report[:suggested_distance] == :euclidean_distance
    end
end

@testset "Probability Vector (Compositional) Matrix Processing" begin
    # Fixed seed so the simulated matrix is the same on every run
    Random.seed!(42)

    # Simulation dimensions
    n_distributions = 7   # distinct parameter sets
    n_samples = 10        # draws per parameter set
    n_features = 100      # entries per draw

    # Dirichlet concentration vectors, one per distribution
    dirichlet_alphas = map(_ -> rand(0.5:0.1:2.0, n_features), 1:n_distributions)

    # Every Dirichlet draw is a length-`n_features` vector that becomes one column
    dirichlet_samples = map(dirichlet_alphas) do alpha
        dist = Distributions.Dirichlet(alpha)
        reduce(hcat, [rand(dist) for _ in 1:n_samples])
    end
    dirichlet_matrix = reduce(hcat, dirichlet_samples)

    # Column labels identify the generating distribution; shuffle columns and
    # labels with the same permutation
    dirichlet_labels = repeat(1:n_distributions; inner=n_samples)
    perm = Random.shuffle(eachindex(dirichlet_labels))
    shuffled_dirichlet_matrix = dirichlet_matrix[:, perm]
    shuffled_dirichlet_labels = dirichlet_labels[perm]

    @testset "Sanity Check - Probability Vector Matrix" begin
        report = Mycelia.sanity_check_matrix(shuffled_dirichlet_matrix)
        # Expected value for each summary field, checked in order
        expected = (
            :n_features => n_features,
            :n_samples => n_samples * n_distributions,
            :is_integer => false,
            :is_nonnegative => true,
            :is_binary => false,
            :is_strictly_positive => true,
            :is_in_01 => true,
            :is_probability_vector => true,
        )
        for (field, value) in expected
            @test report[field] == value
        end
        # Largest deviation of any column sum from 1 must stay within tolerance
        column_sums = sum(shuffled_dirichlet_matrix; dims=1)
        @test maximum(abs, column_sums .- 1) < 1e-8
        # No EPCA method is expected for compositional/probability data
        @test report[:suggested_epca] === nothing
        @test report[:suggested_distance] == :jensen_shannon_divergence
    end
end

In [None]:
# Poisson (counts)
# Simulation dimensions at notebook scope. The identically named variables inside
# the `@testset` blocks are local to those blocks, so without these definitions the
# data-generation code below fails with an UndefVarError.
n_distributions = 7      # Number of distributions
n_samples = 10      # Number of samples per distribution
n_features = 100     # Length of each distribution (number of features)

# One vector of per-feature Poisson rates (integers in 1:10) for each distribution
poisson_lambdas = [rand(1:10, n_features) for _ in 1:n_distributions]
# Each draw is a length-`n_features` count vector that becomes one column
poisson_samples = [hcat([rand.(Distributions.Poisson.(λ)) for _ in 1:n_samples]...) for λ in poisson_lambdas]
poisson_matrix = hcat(poisson_samples...)
# Label every column with the index of the distribution that generated it
poisson_labels = repeat(1:n_distributions, inner=n_samples)
# Apply one permutation to columns and labels so they stay aligned
perm = Random.shuffle(1:length(poisson_labels))
shuffled_poisson_matrix = poisson_matrix[:, perm]
shuffled_poisson_labels = poisson_labels[perm]

In [None]:
# Negative Binomial (overdispersed counts)
# Reads `n_features`, `n_distributions` and `n_samples` from notebook scope.
nb_r = 5  # dispersion parameter
# One vector of per-feature success probabilities for each distribution
nb_ps = [rand(0.2:0.05:0.8, n_features) for _ in 1:n_distributions]
# `p` is a vector, so the constructor itself has to be broadcast
# (`NegativeBinomial.`); the un-dotted call has no method for a vector argument.
nb_samples = [hcat([rand.(Distributions.NegativeBinomial.(nb_r, p)) for _ in 1:n_samples]...) for p in nb_ps]
nb_matrix = hcat(nb_samples...)
# Label every column with the index of the distribution that generated it
nb_labels = repeat(1:n_distributions, inner=n_samples)
# Apply one permutation to columns and labels so they stay aligned
perm = Random.shuffle(1:length(nb_labels))
shuffled_nb_matrix = nb_matrix[:, perm]
shuffled_nb_labels = nb_labels[perm]

In [None]:
# Binomial (counts in 0:ntrials)
# Reads `n_features`, `n_distributions` and `n_samples` from notebook scope.
ntrials = 10
# One vector of per-feature success probabilities for each distribution
binom_ps = [rand(n_features) for _ in 1:n_distributions]
# `p` is a vector, so the constructor itself has to be broadcast (`Binomial.`);
# the un-dotted call has no method for a vector argument.
binom_samples = [hcat([rand.(Distributions.Binomial.(ntrials, p)) for _ in 1:n_samples]...) for p in binom_ps]
binom_matrix = hcat(binom_samples...)
# Label every column with the index of the distribution that generated it
binom_labels = repeat(1:n_distributions, inner=n_samples)
# Apply one permutation to columns and labels so they stay aligned
perm = Random.shuffle(1:length(binom_labels))
shuffled_binom_matrix = binom_matrix[:, perm]
shuffled_binom_labels = binom_labels[perm]

In [None]:
# Continuous Bernoulli (values in (0,1))
# Reads `n_features`, `n_distributions` and `n_samples` from notebook scope.
# Per-feature base probabilities, one vector per distribution
contb_ps = map(_ -> rand(n_features), 1:n_distributions)
# Beta draws whose shape parameters are the base probabilities rescaled by 0.9 and
# shifted by 0.05; each draw becomes one column
contb_samples = map(contb_ps) do p
    shape_a = p * 0.9 .+ 0.05
    shape_b = (1 .- p) * 0.9 .+ 0.05
    reduce(hcat, [rand.(Distributions.Beta.(shape_a, shape_b)) for _ in 1:n_samples])
end
contb_matrix = reduce(hcat, contb_samples)
# Column labels identify the generating distribution
contb_labels = repeat(1:n_distributions; inner=n_samples)
# Same permutation for columns and labels keeps them aligned
perm = Random.shuffle(eachindex(contb_labels))
shuffled_contb_matrix = contb_matrix[:, perm]
shuffled_contb_labels = contb_labels[perm]

In [None]:
# Gamma (strictly positive)
# Reads `n_features`, `n_distributions` and `n_samples` from notebook scope.
# Per-feature shape and scale parameters, one pair of vectors per distribution
gamma_shapes = map(_ -> rand(1.0:0.5:5.0, n_features), 1:n_distributions)
gamma_scales = map(_ -> rand(1.0:0.5:3.0, n_features), 1:n_distributions)
# Each draw is a length-`n_features` vector that becomes one column
gamma_samples = map(gamma_shapes, gamma_scales) do sh, sc
    reduce(hcat, [rand.(Distributions.Gamma.(sh, sc)) for _ in 1:n_samples])
end
gamma_matrix = reduce(hcat, gamma_samples)
# Column labels identify the generating distribution
gamma_labels = repeat(1:n_distributions; inner=n_samples)
# Same permutation for columns and labels keeps them aligned
perm = Random.shuffle(eachindex(gamma_labels))
shuffled_gamma_matrix = gamma_matrix[:, perm]
shuffled_gamma_labels = gamma_labels[perm]

In [None]:
# Gaussian (centered, real-valued)
# Reads `n_features`, `n_distributions` and `n_samples` from notebook scope.
# Per-feature means and standard deviations, one pair of vectors per distribution
gauss_means = map(_ -> randn(n_features), 1:n_distributions)
gauss_stds = map(_ -> rand(0.5:0.1:2.0, n_features), 1:n_distributions)
# Each draw is a length-`n_features` vector that becomes one column
gauss_samples = map(gauss_means, gauss_stds) do μ, σ
    reduce(hcat, [rand.(Distributions.Normal.(μ, σ)) for _ in 1:n_samples])
end
gauss_matrix = reduce(hcat, gauss_samples)
# Column labels identify the generating distribution
gauss_labels = repeat(1:n_distributions; inner=n_samples)
# Same permutation for columns and labels keeps them aligned
perm = Random.shuffle(eachindex(gauss_labels))
shuffled_gauss_matrix = gauss_matrix[:, perm]
shuffled_gauss_labels = gauss_labels[perm]

In [None]:
# Counts

In [None]:
# Relative probabilities

In [None]:
?Mycelia.pca_transform

In [None]:
# Evaluate the binding `Mycelia.pca` so the notebook displays it.
# NOTE(review): nothing in this notebook shows that `Mycelia.pca` exists — confirm.
Mycelia.pca

In [None]:
# TODO - add plain vanilla PCA w and w/o UMAP for binary data
# NOTE(review): `shuffled_binary_matrix` is not created by any cell visible in this
# notebook — confirm it is defined before running this cell.
pca_result = Mycelia.pca_transform(shuffled_binary_matrix)

In [None]:
# Display the `scores` field of `pca_result`.
pca_result.scores

In [None]:
# TODO - add poisson count data vanilla PCA, glm_pca_epca, negbin_pca_epca, and bray_curtis_distance + PCoA w & w/o UMAP

In [None]:












# Poisson simulation whose per-feature means are drawn from an Exponential
# distribution and clamped to [0, λ_max].
# NOTE(review): `L`, `N` and `X` are not defined in the cells visible here; going by
# the comments they are the vector length, the number of distributions and the
# number of samples per distribution — confirm before running.
# This cell also reassigns `poisson_samples` and `shuffled_poisson_labels`.
λ_scale = 0.7         # Lower = more bias toward 0
λ_max = 128.0          # User-defined maximum (set to Inf for no max)

# Step 1: N vectors of Poisson means (length L each), biased toward 0
poisson_means = map(1:N) do _
    clamp.(rand(Distributions.Exponential(λ_scale), L), 0, λ_max)
end

# Step 2: X count vectors per distribution, stacked as columns
poisson_samples = map(poisson_means) do λ
    reduce(hcat, [rand.(Distributions.Poisson.(λ)) for _ in 1:X])
end

# All samples side by side: L x (N*X)
all_poisson_samples = reduce(hcat, poisson_samples)

# Distribution index for every column (each index repeated X times)
all_poisson_labels = repeat(1:N; inner=X)

# One permutation applied to both columns and labels keeps them aligned
perm_poisson = Random.shuffle(axes(all_poisson_samples, 2))
shuffled_poisson_samples = all_poisson_samples[:, perm_poisson]
shuffled_poisson_labels = all_poisson_labels[perm_poisson]




# Fit `Mycelia.glm_pca_epca` with k=5 to `M`, check the output shapes, cluster the
# scores and plot the embedding.
# NOTE(review): `M`, `i`, `n_samples`, `n_features` and `true_labels_poisson` are
# not defined in this cell — it looks like the body of a loop over matrices; confirm.
result = Mycelia.glm_pca_epca(M, k=5)
Test.@test size(result.scores) == (5, n_samples)
Test.@test size(result.loadings) == (5, n_features)

# Fit k-means clustering.
# `Clustering.kmeans` treats every COLUMN as one observation. The assertion above
# fixes `scores` as 5 × n_samples, so it is passed untransposed; the transpose would
# cluster 5 "points" and return 5 labels instead of one label per sample.
fit_labels = Clustering.kmeans(result.scores, 3).assignments

# Plot embeddings
plt = Mycelia.plot_embeddings(result.scores;
               title="GLM PCA-EPCA - Poisson Matrix $i",
               xlabel="PC1",
               ylabel="PC2",
               true_labels=true_labels_poisson[(i-1)*n_samples+1:i*n_samples],
               fit_labels=fit_labels)
display(plt)


# Exercise `Mycelia.pca_transform` on `M`, first with a fixed number of components
# and then with a target proportion of variance; cluster and plot each result.
# NOTE(review): `M`, `i`, `n_samples`, `n_features`, `p_values`, `true_labels_binary`
# and `true_labels_poisson` are not defined in this cell — it looks like the body of
# a loop over matrices; confirm.

# Test with k specified
result_k = Mycelia.pca_transform(M, k=5)
Test.@test size(result_k.scores) == (5, n_samples)
Test.@test size(result_k.loadings) == (5, n_features)

# Fit k-means clustering.
# `Clustering.kmeans` treats every COLUMN as one observation. The assertion above
# fixes `scores` as 5 × n_samples, so it is passed untransposed to obtain one label
# per sample.
fit_labels = Clustering.kmeans(result_k.scores, 3).assignments

# Plot embeddings
plt = Mycelia.plot_embeddings(result_k.scores;
               title="PCA Transform (k=5) - Matrix $i",
               xlabel="PC1",
               ylabel="PC2",
               true_labels=i <= length(p_values) ? true_labels_binary[1:n_samples] : true_labels_poisson[1:n_samples],
               fit_labels=fit_labels)
display(plt)

# Test with var_prop specified
result_var = Mycelia.pca_transform(M, var_prop=0.95)
Test.@test size(result_var.scores, 1) <= n_features
Test.@test size(result_var.loadings, 1) == size(result_var.scores, 1)

# Fit k-means clustering (columns of `scores` are the observations, as above)
fit_labels = Clustering.kmeans(result_var.scores, 3).assignments

# Plot embeddings
plt = Mycelia.plot_embeddings(result_var.scores;
               title="PCA Transform (var_prop=0.95) - Matrix $i",
               xlabel="PC1",
               ylabel="PC2",
               true_labels=i <= length(p_values) ? true_labels_binary[1:n_samples] : true_labels_poisson[1:n_samples],
               fit_labels=fit_labels)
display(plt)


# Test negbin_pca_epca with poisson matrices
# NOTE(review): `M`, `i`, `n_samples`, `n_features` and `true_labels_poisson` are
# not defined in this cell — it looks like the body of a loop over matrices; confirm.

result = Mycelia.negbin_pca_epca(M, k=5, r=2)
Test.@test size(result.scores) == (5, n_samples)
Test.@test size(result.loadings) == (5, n_features)

# Fit k-means clustering.
# `Clustering.kmeans` treats every COLUMN as one observation. The assertion above
# fixes `scores` as 5 × n_samples, so it is passed untransposed to obtain one label
# per sample.
fit_labels = Clustering.kmeans(result.scores, 3).assignments

# Plot embeddings
plt = Mycelia.plot_embeddings(result.scores;
               title="Negative Binomial PCA-EPCA - Poisson Matrix $i",
               xlabel="PC1",
               ylabel="PC2",
               true_labels=true_labels_poisson[(i-1)*n_samples+1:i*n_samples],
               fit_labels=fit_labels)
display(plt)



# Compute a Bray-Curtis distance matrix for `M`, run PCoA down to 2 coordinates,
# then cluster and plot the result.
# NOTE(review): `M`, `i`, `n_samples` and `true_labels_poisson` are not defined in
# this cell — it looks like the body of a loop over matrices; confirm.
# Mycelia is brought in with `import`, which does not expose its functions
# unqualified, so the call is module-qualified like every other Mycelia call here.
# NOTE(review): assumes the function lives at `Mycelia.bray_curtis_distance` — confirm.
D = Mycelia.bray_curtis_distance(M)
result = Mycelia.pcoa_from_dist(D, maxoutdim=2)
Test.@test size(result.coordinates) == (2, n_samples)

# Fit k-means clustering.
# `Clustering.kmeans` treats every COLUMN as one observation. The assertion above
# fixes `coordinates` as 2 × n_samples, so it is passed untransposed to obtain one
# label per sample.
fit_labels = Clustering.kmeans(result.coordinates, 3).assignments

# Plot embeddings
plt = Mycelia.plot_embeddings(result.coordinates;
               title="PCoA - Bray-Curtis Distance - Poisson Matrix $i",
               xlabel="Coordinate 1",
               ylabel="Coordinate 2",
               true_labels=true_labels_poisson[(i-1)*n_samples+1:i*n_samples],
               fit_labels=fit_labels)
display(plt)

