# Mycelia Test Development Notebook
This notebook is for interactive development, debugging, and validation of test blocks from the Mycelia package.  
Copy and paste test blocks between this notebook and the corresponding test files (e.g., `test/1_data_acquisition/ncbi_download.jl`) to iterate quickly.  
Once tests are working as expected, move them back to the `.jl` files for automated testing.

In [None]:
import Pkg
# Activate the project environment found in the parent directory ("..") so the
# imports in the following cells resolve against that project's dependencies.
Pkg.activate("..")

In [None]:
import Random
import Distributions
import MultivariateStats
import UMAP
import Mycelia
import Statistics
import LinearAlgebra
import Test
import Plots
import Clustering

In [None]:
# Seed the default random number generator so the simulated data generated below
# is reproducible when the notebook is run top to bottom from a fresh kernel.
Random.seed!(42)

In [None]:
# Simulation dimensions shared by the cells below
n_features = 100       # Length of each probability/count vector (rows of the data matrices)
n_distributions = 7    # Number of distinct generating distributions
n_samples = 10         # Number of samples drawn from each distribution

In [None]:
# Step 1: draw one probability vector per distribution.
# Each vector has n_features entries, every entry a uniform random value in [0, 1).
binary_probabilities = map(1:n_distributions) do _
    rand(n_features)
end

In [None]:
# Step 2: for every probability vector, draw n_samples Bernoulli vectors and
# stack them column-wise, giving one (n_features x n_samples) matrix per distribution.
binary_samples = map(binary_probabilities) do p
    draws = [rand.(Distributions.Bernoulli.(p)) for _ in 1:n_samples]
    hcat(draws...)
end

In [None]:
# Concatenate the per-distribution sample matrices column-wise into one matrix;
# columns are grouped by distribution in the order of `binary_samples`.
binary_matrix = hcat(binary_samples...)

In [None]:
# True label for every column of the concatenated matrix: each distribution
# index appears n_samples times in a row (1,1,...,2,2,...).
binary_labels = [label for label in 1:n_distributions for _ in 1:n_samples]

In [None]:
# Random permutation of the sample indices; the same permutation is applied to
# both the matrix columns and the label vector so they stay aligned.
binary_perm = Random.shuffle(eachindex(binary_labels))

In [None]:
# Reorder the sample columns according to the permutation
shuffled_binary_matrix = binary_matrix[:, binary_perm]

In [None]:
# Apply the same permutation to the labels so label i still describes column i
shuffled_binary_labels = binary_labels[binary_perm]

In [None]:
# Value passed as `k` to Mycelia.logistic_pca_epca; the tests below expect this
# many rows in both the scores and the loadings.
logistic_epca_k = 3

In [None]:
# Run Mycelia's logistic PCA-EPCA on the shuffled binary matrix.
# NOTE(review): `k` is presumably the number of latent dimensions — the tests
# below check for k rows in `result.scores` and `result.loadings`.
result = Mycelia.logistic_pca_epca(shuffled_binary_matrix, k=logistic_epca_k)

In [None]:
# Scores: one row per requested component, one column per sample
Test.@test size(result.scores) == (logistic_epca_k, n_samples * n_distributions)

In [None]:
# Loadings: one row per requested component, one column per feature
Test.@test size(result.loadings) == (logistic_epca_k, n_features)

In [None]:
# NOTE: logistic_epca on its own performed so poorly that the standalone visualization and clustering results were not kept

In [None]:
# Embed the logistic PCA-EPCA scores with Mycelia.umap_embed; the test below
# expects a 2-row embedding with one column per sample.
umap_model = Mycelia.umap_embed(result.scores)

In [None]:
# The UMAP embedding should be 2 x (number of samples)
Test.@test size(umap_model.embedding) == (2, n_samples * n_distributions)

In [None]:
# Fit k-means clustering on the UMAP embedding, asking for one cluster per
# generating distribution, and keep the per-sample cluster assignments.
# Clustering.kmeans treats each column of its input as one data point.
fit_labels = Clustering.kmeans(umap_model.embedding, n_distributions).assignments

In [None]:
# Plot the UMAP embedding, passing both the known distribution labels and the
# k-means assignments. The axes are UMAP dimensions, not principal components,
# so they are labelled accordingly.
plt = Mycelia.plot_embeddings(umap_model.embedding;
               title="Logistic PCA-EPCA - Binary Matrix - UMAP",
               xlabel="UMAP1",
               ylabel="UMAP2",
               true_labels=shuffled_binary_labels,
               fit_labels=fit_labels)

In [None]:
# Jaccard distance matrix computed from the shuffled binary matrix.
# NOTE(review): presumably pairwise distances between columns (samples) — the
# PCoA test below expects one coordinate column per sample; confirm against Mycelia.
binary_distance_matrix = Mycelia.frequency_matrix_to_jaccard_distance_matrix(shuffled_binary_matrix)

In [None]:
# Principal coordinates analysis on the Jaccard distance matrix, capped at 3 output dimensions
pcoa_result = Mycelia.pcoa_from_dist(binary_distance_matrix, maxoutdim=3)

In [None]:
# pcoa_result.coordinates

In [None]:
# Coordinates should have one column per sample and as many rows as the fitted
# model's `U` has columns (compared against the model rather than a fixed number).
Test.@test size(pcoa_result.coordinates) == (size(pcoa_result.model.U, 2), n_samples * n_distributions)

In [None]:
# Fit k-means clustering on the PCoA coordinates (columns are samples), asking
# for one cluster per generating distribution, and keep the assignments.
pcoa_fit_labels = Clustering.kmeans(pcoa_result.coordinates, n_distributions).assignments

In [None]:
# Plot the PCoA coordinates, passing the known distribution labels alongside
# the k-means assignments computed from the same coordinates.
plt = Mycelia.plot_embeddings(
    pcoa_result.coordinates;
    fit_labels = pcoa_fit_labels,
    true_labels = shuffled_binary_labels,
    title = "PCoA - Jaccard Distance - Binary Matrix",
    xlabel = "PC1",
    ylabel = "PC2",
)

In [None]:
# Display the PCoA coordinate matrix for inspection
pcoa_result.coordinates

In [None]:
# Embed the PCoA coordinates with UMAP.
# NOTE: this rebinds `umap_model`, replacing the embedding of the logistic
# PCA-EPCA scores created in an earlier cell.
umap_model = Mycelia.umap_embed(pcoa_result.coordinates)

In [None]:
# The UMAP embedding of the PCoA coordinates should be 2 x (number of samples)
Test.@test size(umap_model.embedding) == (2, n_samples * n_distributions)

In [None]:
# Fit k-means clustering on the UMAP embedding of the PCoA coordinates, one
# cluster per generating distribution.
# NOTE: this rebinds `fit_labels` from the earlier logistic PCA-EPCA section.
fit_labels = Clustering.kmeans(umap_model.embedding, n_distributions).assignments

In [None]:
# Plot the UMAP embedding of the PCoA coordinates, passing both the known
# distribution labels and the k-means assignments. The axes are UMAP
# dimensions, not principal components, so they are labelled accordingly.
plt = Mycelia.plot_embeddings(umap_model.embedding;
               title="PCoA - Jaccard Distance - Binary Matrix - UMAP",
               xlabel="UMAP1",
               ylabel="UMAP2",
               true_labels=shuffled_binary_labels,
               fit_labels=fit_labels)

In [None]:
# TODO - add plain vanilla PCA w and w/o UMAP for binary data

In [None]:
# TODO - add poisson count data vanilla PCA, glm_pca_epca, negbin_pca_epca, and bray_curtis_distance + PCoA w & w/o UMAP

In [None]:












# Parameters for Poisson λ distribution
λ_scale = 0.7         # Scale of the Exponential that generates each λ; lower = more bias toward 0
λ_max = 128.0         # User-defined maximum for λ (set to Inf for no max)

# Step 1: Generate n_distributions vectors of Poisson means (n_features entries each).
# Means are drawn from an Exponential so most are close to 0, then capped at λ_max.
# Uses the notebook-wide n_features / n_distributions / n_samples parameters
# (the previous L / N / X names were never defined in this notebook).
poisson_means = [
    clamp.(rand(Distributions.Exponential(λ_scale), n_features), 0, λ_max)
    for _ in 1:n_distributions
]

# Step 2: For each distribution, sample n_samples count vectors (each column is a sample)
poisson_samples = [
    hcat([rand.(Distributions.Poisson.(λ)) for _ in 1:n_samples]...)
    for λ in poisson_means
]

# Concatenate all samples into one matrix (n_features x (n_distributions * n_samples))
all_poisson_samples = hcat(poisson_samples...)

# Create a label vector: for each distribution, repeat its index n_samples times
all_poisson_labels = repeat(1:n_distributions, inner=n_samples)

# Shuffle columns and labels together (same permutation keeps them aligned)
perm_poisson = Random.shuffle(1:size(all_poisson_samples, 2))
shuffled_poisson_samples = all_poisson_samples[:, perm_poisson]
shuffled_poisson_labels = all_poisson_labels[perm_poisson]




# GLM PCA-EPCA on the shuffled Poisson count matrix with k = 5.
# The input has one column per sample, so the scores are expected to be
# 5 x (n_samples * n_distributions) and the loadings 5 x n_features.
result = Mycelia.glm_pca_epca(shuffled_poisson_samples, k=5)
Test.@test size(result.scores) == (5, n_samples * n_distributions)
Test.@test size(result.loadings) == (5, n_features)

# Fit k-means clustering with one cluster per generating distribution.
# Clustering.kmeans treats columns as data points, so the (k x samples) score
# matrix is passed as-is; transposing it would cluster the components instead.
fit_labels = Clustering.kmeans(result.scores, n_distributions).assignments

# Plot embeddings against the true (shuffled) distribution labels
plt = Mycelia.plot_embeddings(result.scores;
               title="GLM PCA-EPCA - Poisson Matrix",
               xlabel="PC1",
               ylabel="PC2",
               true_labels=shuffled_poisson_labels,
               fit_labels=fit_labels)
display(plt)


# Test plain PCA with k specified, on the shuffled Poisson count matrix.
# Scores are expected to be 5 x (n_samples * n_distributions), loadings 5 x n_features.
result_k = Mycelia.pca_transform(shuffled_poisson_samples, k=5)
Test.@test size(result_k.scores) == (5, n_samples * n_distributions)
Test.@test size(result_k.loadings) == (5, n_features)

# Fit k-means clustering with one cluster per generating distribution.
# Clustering.kmeans treats columns as data points, so the scores are not transposed.
fit_labels = Clustering.kmeans(result_k.scores, n_distributions).assignments

# Plot embeddings against the true (shuffled) distribution labels
plt = Mycelia.plot_embeddings(result_k.scores;
               title="PCA Transform (k=5) - Poisson Matrix",
               xlabel="PC1",
               ylabel="PC2",
               true_labels=shuffled_poisson_labels,
               fit_labels=fit_labels)
display(plt)

# Test plain PCA with var_prop specified, on the shuffled Poisson count matrix.
# The number of retained components is data-dependent, so only bounds and
# consistency between scores and loadings are checked.
result_var = Mycelia.pca_transform(shuffled_poisson_samples, var_prop=0.95)
Test.@test size(result_var.scores, 1) <= n_features
Test.@test size(result_var.loadings, 1) == size(result_var.scores, 1)

# Fit k-means clustering with one cluster per generating distribution.
# Clustering.kmeans treats columns as data points, so the scores are not transposed.
fit_labels = Clustering.kmeans(result_var.scores, n_distributions).assignments

# Plot embeddings against the true (shuffled) distribution labels
plt = Mycelia.plot_embeddings(result_var.scores;
               title="PCA Transform (var_prop=0.95) - Poisson Matrix",
               xlabel="PC1",
               ylabel="PC2",
               true_labels=shuffled_poisson_labels,
               fit_labels=fit_labels)
display(plt)


# Test negbin_pca_epca on the shuffled Poisson count matrix (k = 5, r = 2).
# Scores are expected to be 5 x (n_samples * n_distributions), loadings 5 x n_features.
result = Mycelia.negbin_pca_epca(shuffled_poisson_samples, k=5, r=2)
Test.@test size(result.scores) == (5, n_samples * n_distributions)
Test.@test size(result.loadings) == (5, n_features)

# Fit k-means clustering with one cluster per generating distribution.
# Clustering.kmeans treats columns as data points, so the scores are not transposed.
fit_labels = Clustering.kmeans(result.scores, n_distributions).assignments

# Plot embeddings against the true (shuffled) distribution labels
plt = Mycelia.plot_embeddings(result.scores;
               title="Negative Binomial PCA-EPCA - Poisson Matrix",
               xlabel="PC1",
               ylabel="PC2",
               true_labels=shuffled_poisson_labels,
               fit_labels=fit_labels)
display(plt)



# Bray-Curtis distance matrix for the shuffled Poisson count matrix.
# NOTE(review): the unqualified `bray_curtis_distance` is not in scope with
# `import Mycelia`; the name below mirrors the Jaccard helper used earlier in
# this notebook — confirm it matches the function exported by Mycelia.
D = Mycelia.frequency_matrix_to_bray_curtis_distance_matrix(shuffled_poisson_samples)
result = Mycelia.pcoa_from_dist(D, maxoutdim=2)
Test.@test size(result.coordinates) == (2, n_samples * n_distributions)

# Fit k-means clustering with one cluster per generating distribution.
# Clustering.kmeans treats columns as data points, so the coordinates are not transposed.
fit_labels = Clustering.kmeans(result.coordinates, n_distributions).assignments

# Plot embeddings against the true (shuffled) distribution labels
plt = Mycelia.plot_embeddings(result.coordinates;
               title="PCoA - Bray-Curtis Distance - Poisson Matrix",
               xlabel="Coordinate 1",
               ylabel="Coordinate 2",
               true_labels=shuffled_poisson_labels,
               fit_labels=fit_labels)
display(plt)

