In [1]:
TODAY="2021-11-13"
TASK = "phylogenetic-determination"
DIR = "$(homedir())/$(TODAY)-$(TASK)"
if !isdir(DIR)
    mkdir(DIR)
end
cd(DIR)

In [2]:
import Pkg

pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"Plots",
"StatsPlots",
"StatsBase",
"Statistics",
"Mmap",
"MultivariateStats",
"PyCall",
"Random",
"Primes",
"Revise",
"SparseArrays",
"SHA",
"Mycelia",
"GenomicAnnotations",
"BioFetch",
"Combinatorics",
"StaticArrays",
"BioSymbols",
"RollingFunctions",
"OrderedCollections"
]

for pkg in pkgs
    try
        eval(Meta.parse("import $pkg"))
    catch
        Pkg.add(pkg)
        eval(Meta.parse("import $pkg"))
    end
end

In [3]:
function generate_all_possible_kmers(k, alphabet)
    kmer_iterator = Iterators.product([alphabet for i in 1:k]...)
    kmer_vectors = collect.(vec(collect(kmer_iterator)))
    if eltype(alphabet) == BioSymbols.AminoAcid
        kmers = BioSequences.LongAminoAcidSeq.(kmer_vectors)
    elseif eltype(alphabet) == BioSymbols.DNA
        kmers = BioSequences.LongDNASeq.(kmer_vectors)
    else
        error()
    end
    return sort!(kmers)
end

function generate_all_possible_canonical_kmers(k, alphabet)
    kmers = generate_all_possible_kmers(k, alphabet)
    if eltype(alphabet) == BioSymbols.AminoAcid
        return kmers
    elseif eltype(alphabet) == BioSymbols.DNA
        return BioSequences.DNAMer.(unique!(BioSequences.canonical.(kmers)))
    else
        error()
    end
end

generate_all_possible_canonical_kmers (generic function with 1 method)

In [4]:
function count_canonical_aamers(k, fasta_proteins)
    aamer_counts = OrderedCollections.OrderedDict{BioSequences.LongAminoAcidSeq, Int64}()
    for protein in fasta_proteins
        s = FASTX.sequence(protein)
        these_counts = sort(StatsBase.countmap([s[i:i+k-1] for i in 1:length(s)-k-1]))
        merge!(+, aamer_counts, these_counts)
    end
    return sort(aamer_counts)
end

count_canonical_aamers (generic function with 1 method)

In [5]:
function update_counts_matrix!(matrix, sample_index, countmap, sorted_kmers)
    for (i, kmer) in enumerate(sorted_kmers)
        matrix[i, sample_index] = get(countmap, kmer, 0)
    end
    return matrix
end

update_counts_matrix! (generic function with 1 method)

```julia
get_sequence(; db, accession, ftp) -> Union{Nothing, FASTX.FASTA.Reader}

```

Get dna (db = "nuccore") or protein (db = "protein") sequences from NCBI or get fasta directly from FTP site

```jldoctest
julia> 1 + 1
2
```


In [83]:
function accession_list_to_aamer_counts_table(accession_list, k, AA_ALPHABET)
    
    canonical_aamers = generate_all_possible_canonical_kmers(k, AA_ALPHABET)
    
    aamer_counts_matrix = zeros(length(canonical_aamers), length(accession_list))
    
    ProgressMeter.@showprogress for (entity_index, accession) in enumerate(accession_list)
#         entity_genbank = BioFetch.fetchseq(accession, format = BioFetch.gb)
        open("$(accession).fna", "w") do io
            fastx_io = FASTX.FASTA.Writer(io)
#             for chromosome in entity_genbank
#                 write(fastx_io, FASTX.FASTA.Record(chromosome.name, chromosome.sequence))
#             end
            for record in Mycelia.get_sequence(db="nuccore", accession = accession)
                write(fastx_io, record)
            end
            close(fastx_io)
        end
#         run(`prodigal -i $(accession).fna -o $(accession).fna.genes -a $(accession).fna.faa -p meta`)
        try
            run(pipeline(`prodigal -i $(accession).fna -o $(accession).fna.genes -a $(accession).fna.faa -p meta`, stderr="$(accession).fna.prodigal.stderr"))
            entity_aamer_counts = count_canonical_aamers(aa_k, collect(FASTX.FASTA.Reader(open("$(accession).fna.faa"))))
#         fasta_proteins = Vector{FASTX.FASTA.Record}()
#         for gene in GenomicAnnotations.@genes(entity_genbank, CDS)
#             try
#                 dna_seq = GenomicAnnotations.sequence(gene)
#                 # if seq isn't divisible by 3, cut final bases that can't be mapped to a codon
#                 translateable_length = div(length(dna_seq), 3) * 3
#                 truncated_dna_seq = dna_seq[1:translateable_length]
#                 aa_seq = BioSequences.translate(truncated_dna_seq)
#                 id = getproperty(gene, :protein_id)
#                 product = getproperty(gene, :product)
#                 record = FASTX.FASTA.Record(id, product, aa_seq)
#                 push!(fasta_proteins, record)
#             catch
# #                 @error GenomicAnnotations.sequence(gene)
# #                 @error GenomicAnnotations.sequence(gene)
#                 continue
# #                 error()
#             end
#         end
            update_counts_matrix!(aamer_counts_matrix, entity_index, entity_aamer_counts, canonical_aamers)
        catch
            println("problem with accession $accession")
            @show isempty.(FASTX.sequence.(fasta))
            @show fasta
        end
    end
    return aamer_counts_matrix
end

accession_list_to_aamer_counts_table (generic function with 1 method)

In [70]:
function accession_list_to_dnamer_counts_table(accession_list, k)
    canonical_dnamers = generate_all_possible_canonical_kmers(k, DNA_ALPHABET)
    dnamer_counts_matrix = zeros(length(canonical_dnamers), length(accession_list))

    ProgressMeter.@showprogress for (entity_index, accession) in enumerate(accession_list)
#         entity_genbank = BioFetch.fetchseq(accession, format = BioFetch.gb)
        fasta_dna_sequences = collect(Mycelia.get_sequence(db="nuccore", accession = accession))
#         fasta_dna_sequences = [
#             FASTX.FASTA.Record(chromosome.name, chromosome.sequence) 
#                 for chromosome in entity_genbank
#                               ]
        entity_dnamer_counts = Mycelia.count_canonical_kmers(BioSequences.DNAMer{dna_k}, fasta_dna_sequences)
        update_counts_matrix!(dnamer_counts_matrix, entity_index, entity_dnamer_counts, canonical_dnamers)
    end
    return dnamer_counts_matrix    
end

accession_list_to_dnamer_counts_table (generic function with 1 method)

In [8]:
function normalize_distance_matrix(distance_matrix)
    max_non_nan_value = maximum(filter(x -> !isnan(x) && !isnothing(x) && !ismissing(x), vec(distance_matrix)))
    return distance_matrix ./ max_non_nan_value
end

normalize_distance_matrix (generic function with 1 method)

In [9]:
function count_matrix_to_probability_matrix(counts_matrix)
    probability_matrix = copy(counts_matrix)
    for (i, col) in enumerate(eachcol(probability_matrix))
        probability_matrix[:, i] .= col ./ sum(col)
    end
    return probability_matrix
end

count_matrix_to_probability_matrix (generic function with 1 method)

In [10]:
# MYCELIA_METADATA = joinpath(Pkg.dir("Mycelia"), "metadata")
MYCELIA_METADATA = joinpath(dirname(dirname(pathof(Mycelia))),  "metadata")

"/home/jupyter-cjprybol/.julia/dev/Mycelia/metadata"

In [11]:
AA_ALPHABET = collect(filter(x -> x != BioSequences.AA_Term, Mycelia.AA_ALPHABET))
DNA_ALPHABET = Mycelia.DNA_ALPHABET

(DNA_A, DNA_C, DNA_G, DNA_T)

- Jaccard doesn't seem to differentiate sufficiently between sequences without getting into really large k sizes
- Euclidean is consistently seperating within vs between @ 10 samples of family
- JS and Cosine have some entities that are going the wrong way regarding distance is less between families

In [46]:
# n_samples = 1
# n_samples = 10
n_samples = 100
# n_samples = 1000

100

In [47]:
# https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Bacteriophage,%20all%20taxids&Completeness_s=complete
entity_metadata = DataFrames.DataFrame(uCSV.read("$(MYCELIA_METADATA)/2021-11-13-ncbi-complete-bacteriophage.csv", header=1, quotes='"')...)
entity_metadata = entity_metadata[entity_metadata[!, "Nuc_Completeness"] .== "complete", :]
entity_metadata = entity_metadata[entity_metadata[!, "Sequence_Type"] .== "RefSeq", :]
sort!(entity_metadata, "Accession")
Random.seed!(0)
subset_indices = StatsBase.sample(1:size(entity_metadata, 1), n_samples, ordered=true, replace=false)
entity_metadata = entity_metadata[subset_indices, :]

Unnamed: 0_level_0,Accession,SRA_Accession,Submitters
Unnamed: 0_level_1,String,String,String
1,NC_000935.1,,"van der Wilk,F., Dullemans,A.M., Verbeek,M., van den Heuvel,J.F., van den Heuvel,J.F.J.M."
2,NC_001332.1,,"Stassen,A.P., Schoenmakers,E.F., Yu,M., Schoenmakers,J.G., Konings,R.N., Konings,R.N.H."
3,NC_003387.1,,"Ford,M.E., Stenstrom,C., Hendrix,R.W., Hatfull,G.F."
4,NC_005083.2,,"Miller,E.S., Heidelberg,J.F., Eisen,J.A., Nelson,W.C., Durkin,A.S., Ciecko,A., Feldblyum,T.V., White,O., Paulsen,I.T., Nierman,W.C., Lee,J., Szczypinski,B., Fraser,C.M., Miller,E., Heidelberg,J., Eisen,J., Nelson,W., Durkin,A., Feldblyum,T., Paulsen,I., Nierman,W., Fraser,C."
5,NC_007023.1,,"Petrov,V.M., Nolan,J.M., Chin,D., Krisch,H.M., Karam,J.D."
6,NC_008198.1,,"Hatfull,G.F., Pedulla,M.L., Jacobs-Sera,D., Cichon,P.M., Foley,A., Ford,M.E., Gonda,R.M., Houtz,J.M., Hryckowian,A.J., Kelchner,V.A., Namburi,S., Pajcini,K.V., Popovich,M.G., Schleicher,D.T., Simanek,B.Z., Smith,A.L., Zdanowicz,G.M., Kumar,V., Peebles,C.L., Jacobs,W.R. Jr., Lawrence,J.G., Hendrix,R.W."
7,NC_011107.1,,"Glonti,T., Lingohr,E.J., Kropinski,A.M., Chanishvili,N."
8,NC_014635.1,,"Kim,J.H., Son,J.S., Choi,Y.J., Choresca,C.H., Shin,S.P., Han,J.E., Jun,J.W., Park,S.C., Kim,K.S."
9,NC_015292.1,,"Muller,I., Kube,M., Reinhardt,R., Jelkmann,W., Geider,K., Mueller,I."
10,NC_015294.2,,"Henry,M., Lavigne,R., Debarbieux,L., Leduc,D., Maura,D., Morello,E., Criscuolo,A., Grossi,O., Balloy,V., Touqui,L."


In [66]:
accession_list = entity_metadata[!, "Accession"]

100-element Vector{String}:
 "NC_000935.1"
 "NC_001332.1"
 "NC_003387.1"
 "NC_005083.2"
 "NC_007023.1"
 "NC_008198.1"
 "NC_011107.1"
 "NC_014635.1"
 "NC_015292.1"
 "NC_015294.2"
 "NC_016071.1"
 "NC_016655.1"
 "NC_018285.1"
 ⋮
 "NC_052661.1"
 "NC_052981.1"
 "NC_053237.1"
 "NC_054777.1"
 "NC_054890.1"
 "NC_054940.1"
 "NC_054942.1"
 "NC_055031.1"
 "NC_055040.1"
 "NC_055827.1"
 "NC_055843.1"
 "NC_055912.1"

In [49]:
# these are too small, all of the within vs between have some disagreement
# dna_k = 5
# aa_k = 2
dna_k = 7
aa_k = 3

3

In [50]:
# http://leb.snu.ac.kr/ezaai

In [51]:
# rep_origin      complement(16763)
# variation       complement(2665)
#      misc_feature    complement(3871)

#      variation       complement(3939)
#      variation       complement(20826)
#      modified_base   complement(29281)
#      modified_base   complement(29282)
#      modified_base   complement(29295)
#      modified_base   complement(31645)
#      modified_base   complement(31646)
#      modified_base   complement(31669)
#      modified_base   complement(31671)
#      modified_base   complement(31689)
#      modified_base   complement(31691)
#      modified_base   complement(31692)
#      modified_base   complement(32533)
#      modified_base   complement(32534)
#      modified_base   complement(32542)
#      modified_base   complement(32551)
#      modified_base   complement(32568)
#      modified_base   complement(33589)
#      modified_base   complement(33599)
#      modified_base   complement(33600)
#      modified_base   complement(33973)
#      modified_base   complement(33974)
#      modified_base   complement(33982)
#      modified_base   complement(33988)
#      modified_base   complement(33991)
#      modified_base   complement(34009)
#      variation       complement(49332)
#      variation       complement(50257)
#      variation       complement(51626)
#      variation       complement(92822)
#      variation       complement(101898)
#      variation       complement(102426)
#      variation       complement(105008)
#      variation       complement(107430)


In [52]:
# looking for columns that don't have any counts

In [53]:
# run(`sudo conda install -c bioconda prodigal`)

In [84]:
aamer_counts_matrix = accession_list_to_aamer_counts_table(accession_list, aa_k, AA_ALPHABET)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:29[39m


10648×100 Matrix{Float64}:
 9.0  2.0  56.0  32.0  19.0  24.0  24.0  …  11.0  10.0  1.0  22.0  53.0  44.0
 6.0  0.0  24.0  21.0  10.0  18.0   7.0     12.0   7.0  1.0  13.0  21.0  34.0
 4.0  0.0   2.0  13.0  18.0   5.0   5.0     11.0   3.0  0.0   6.0   9.0   6.0
 4.0  1.0  18.0  18.0  14.0  15.0   6.0     11.0   7.0  3.0  14.0  12.0  15.0
 1.0  0.0   1.0   4.0   1.0   2.0   1.0      2.0   2.0  1.0   3.0   3.0   2.0
 6.0  0.0   9.0  20.0   5.0  16.0  11.0  …  23.0   3.0  3.0  16.0  14.0  17.0
 4.0  1.0  26.0  27.0  14.0  13.0  11.0     21.0   9.0  2.0   8.0  19.0  14.0
 8.0  0.0  15.0  29.0  18.0  18.0  19.0     18.0  12.0  4.0  26.0  42.0  31.0
 2.0  0.0   5.0   6.0   3.0   2.0   4.0      2.0   0.0  1.0   3.0   5.0   8.0
 4.0  1.0  23.0  34.0  22.0   6.0   5.0     21.0   7.0  3.0  10.0  16.0  10.0
 7.0  1.0  25.0  38.0  24.0  14.0  22.0  …  20.0   9.0  3.0  23.0  28.0  27.0
 7.0  1.0  17.0  28.0  17.0  10.0  11.0     25.0   9.0  6.0  19.0  12.0  11.0
 3.0  0.0   7.0  13.0   7.0   8.0   4

In [77]:
[i for (i, col) in enumerate(eachcol(aamer_counts_matrix)) if all(col .== 0)]

4-element Vector{Int64}:
  2
 22
 45
 75

In [80]:
[i for (i, col) in enumerate(eachcol(dnamer_counts_matrix)) if all(col .== 0)]

Int64[]

In [79]:
dnamer_counts_matrix = accession_list_to_dnamer_counts_table(accession_list, dna_k)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:02[39m


8192×100 Matrix{Float64}:
 20.0  6.0  0.0   26.0   6.0  0.0  0.0  …   11.0   2.0  43.0   0.0  0.0  2.0
 15.0  4.0  1.0   40.0  53.0  0.0  0.0      22.0  16.0  27.0   3.0  0.0  0.0
 17.0  6.0  0.0   93.0  55.0  1.0  0.0      44.0  20.0  52.0   3.0  0.0  0.0
 29.0  0.0  1.0   38.0  50.0  0.0  0.0     105.0  11.0  69.0   1.0  0.0  2.0
 13.0  4.0  3.0   69.0  69.0  2.0  0.0      47.0  18.0  40.0   7.0  0.0  0.0
 13.0  6.0  1.0   28.0  40.0  0.0  3.0  …   27.0   7.0  13.0   3.0  2.0  1.0
 10.0  2.0  2.0   69.0  66.0  1.0  0.0      48.0  22.0  14.0   3.0  1.0  1.0
  9.0  2.0  4.0   68.0  64.0  3.0  0.0      81.0  13.0  31.0   2.0  0.0  0.0
 22.0  3.0  1.0   73.0  81.0  2.0  0.0     123.0  33.0  49.0   8.0  0.0  1.0
 22.0  2.0  2.0   92.0  64.0  2.0  1.0      64.0  15.0  36.0   5.0  0.0  0.0
 18.0  3.0  1.0   61.0  43.0  1.0  0.0  …   65.0  16.0  32.0   9.0  1.0  0.0
 11.0  2.0  1.0   66.0  53.0  0.0  0.0      79.0  16.0  26.0   5.0  0.0  1.0
 31.0  5.0  0.0   44.0  62.0  0.0  0.0     149.0  

In [56]:
unique_species = filter(!isempty, sort(unique(entity_metadata[!, "Species"])))
unique_genera = filter(!isempty, sort(unique(entity_metadata[!, "Genus"])))
unique_families = filter(!isempty, sort(unique(entity_metadata[!, "Family"])))

12-element Vector{String}:
 "Ackermannviridae"
 "Autographiviridae"
 "Demerecviridae"
 "Drexlerviridae"
 "Herelleviridae"
 "Inoviridae"
 "Microviridae"
 "Myoviridae"
 "Podoviridae"
 "Salasmaviridae"
 "Siphoviridae"
 "Zobellviridae"

In [57]:
aamer_probability_matrix = count_matrix_to_probability_matrix(aamer_counts_matrix)
dnamer_probility_matrix = count_matrix_to_probability_matrix(dnamer_counts_matrix)

8192×100 Matrix{Float64}:
 0.000547675  0.000890472  0.0         …  0.0          0.0         3.69263e-5
 0.000410756  0.000593648  1.89426e-5     4.41079e-5   0.0         0.0
 0.000465524  0.000890472  0.0            4.41079e-5   0.0         0.0
 0.000794129  0.0          1.89426e-5     1.47026e-5   0.0         3.69263e-5
 0.000355989  0.000593648  5.68279e-5     0.000102918  0.0         0.0
 0.000355989  0.000890472  1.89426e-5  …  4.41079e-5   3.42149e-5  1.84631e-5
 0.000273838  0.000296824  3.78852e-5     4.41079e-5   1.71075e-5  1.84631e-5
 0.000246454  0.000296824  7.57705e-5     2.94053e-5   0.0         0.0
 0.000602443  0.000445236  1.89426e-5     0.000117621  0.0         1.84631e-5
 0.000602443  0.000296824  3.78852e-5     7.35132e-5   0.0         0.0
 0.000492908  0.000445236  1.89426e-5  …  0.000132324  1.71075e-5  0.0
 0.000301221  0.000296824  1.89426e-5     7.35132e-5   0.0         1.84631e-5
 0.000848896  0.00074206   0.0            0.0          0.0         0.0
 ⋮       

In [58]:
matrix_metric_grammar_groups = [
        (normalize_distance_matrix(Distances.pairwise(Distances.euclidean, aamer_counts_matrix, dims=2)), "euclidean", "AA"), # good
        (normalize_distance_matrix(Distances.pairwise(Distances.euclidean, dnamer_counts_matrix, dims=2)), "euclidean", "DNA"), # good
        (normalize_distance_matrix(Distances.pairwise(Distances.cityblock, aamer_counts_matrix, dims=2)), "cityblock", "AA"), # redundant with above
        (normalize_distance_matrix(Distances.pairwise(Distances.cityblock, dnamer_counts_matrix, dims=2)), "cityblock", "DNA"), # redundant with above
        
        (normalize_distance_matrix(Distances.pairwise(Distances.corr_dist, aamer_counts_matrix, dims=2)), "corr_dist", "AA"), # meh
        (normalize_distance_matrix(Distances.pairwise(Distances.corr_dist, dnamer_counts_matrix, dims=2)), "corr_dist", "DNA"), # meh
        (normalize_distance_matrix(Distances.pairwise(Distances.cosine_dist, aamer_counts_matrix, dims=2)), "cosine_dist", "AA"), # meh
        (normalize_distance_matrix(Distances.pairwise(Distances.cosine_dist, dnamer_counts_matrix, dims=2)), "cosine_dist", "DNA"), # very bad
        
        (normalize_distance_matrix(Distances.pairwise(Distances.totalvariation, aamer_probability_matrix, dims=2)), "totalvariation", "AA"), # good
        (normalize_distance_matrix(Distances.pairwise(Distances.totalvariation, dnamer_probility_matrix, dims=2)), "totalvariation", "DNA"), # bad
        (normalize_distance_matrix(Distances.pairwise(Distances.js_divergence, aamer_probability_matrix, dims=2)), "js_divergence", "AA"), # good
        (normalize_distance_matrix(Distances.pairwise(Distances.js_divergence, dnamer_probility_matrix, dims=2)), "js_divergence", "DNA"), # bad
        (normalize_distance_matrix(Distances.pairwise(Distances.bhattacharyya, aamer_probability_matrix, dims=2)), "bhattacharyya", "AA"), # good
        (normalize_distance_matrix(Distances.pairwise(Distances.bhattacharyya, dnamer_probility_matrix, dims=2)), "bhattacharyya", "DNA"), # bad
        (normalize_distance_matrix(Distances.pairwise(Distances.hellinger, aamer_probability_matrix, dims=2)), "hellinger", "AA"), # good
        (normalize_distance_matrix(Distances.pairwise(Distances.hellinger, dnamer_probility_matrix, dims=2)), "hellinger", "DNA"), # bad
    ]

16-element Vector{Tuple{Matrix{Float64}, String, String}}:
 ([0.0 0.16802446767241466 … 0.23938225052595452 0.1488040408149276; 0.16802446767241466 0.0 … 0.20231109853584742 0.1622283703100427; … ; 0.23938225052595452 0.20231109853584742 … 0.0 0.2545564354245891; 0.1488040408149276 0.1622283703100427 … 0.2545564354245891 0.0], "euclidean", "AA")
 ([0.0 0.046332018196880015 … 0.11416380539546334 0.10441996682963602; 0.046332018196880015 0.0 … 0.11777370866059689 0.10723184992881744; … ; 0.11416380539546334 0.11777370866059689 … 0.0 0.0378078291181241; 0.10441996682963602 0.10723184992881744 … 0.0378078291181241 0.0], "euclidean", "DNA")
 ([0.0 0.15148152442155124 … 0.20981499577653742 0.1294035808337612; 0.15148152442155124 0.0 … 0.17867731089653344 0.13944051542806035; … ; 0.20981499577653742 0.17867731089653344 … 0.0 0.2144028355168359; 0.1294035808337612 0.13944051542806035 … 0.2144028355168359 0.0], "cityblock", "AA")
 ([0.0 0.061452110812652845 … 0.12456400593352844 0.1152026620695

In [60]:
for (taxon_level, unique_taxa) in ("Family" => unique_families, "Genus" => unique_genera)
    for (distance_matrix, distance_metric, grammar) in matrix_metric_grammar_groups

            within_vs_between_distances = []
            ProgressMeter.@showprogress for taxon in unique_taxa
                taxa_indices = findall(entity_metadata[!, taxon_level] .== taxon)
                other_taxa_indices = setdiff(1:1:size(distance_matrix, 1), taxa_indices)
                for index in taxa_indices
                    other_indices = filter(i -> i != index, taxa_indices)
                    avg_within_taxa_distance = Statistics.mean(vec(distance_matrix[index, other_indices]))
                    avg_between_taxa_distance = Statistics.mean(vec(distance_matrix[index, other_taxa_indices]))
                    push!(within_vs_between_distances, avg_within_taxa_distance => avg_between_taxa_distance)
                end
            end
#             @show within_vs_between_distances
            within_vs_between_distances = filter(d -> !any(map(x1 -> isnan(x1), collect(d))), within_vs_between_distances)

            ys = collect.(within_vs_between_distances)

            xs = [[1, 2] for x in ys]

            if grammar == "AA"
                k = aa_k
            elseif grammar == "DNA"
                k = dna_k
            end

            n = size(distance_matrix, 1)

            p = StatsPlots.plot(
                xs,
                ys,
                xticks = ([1, 2], ["within $(taxon_level)", "between $(taxon_level)"]),
                legend = false,
                xlims = (0.75, 2.25),
                alpha = 0.2,
                title = "$(distance_metric) distance @ k=$k & $grammar\n(n=$(n))",
                ylabel = "normalized distance",
                marker = :circle
            )
            StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-$(taxon_level)-k$k-$(grammar)-n$n.png")
            StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-$(taxon_level)-k$k-$(grammar)-n$n.svg")
            display(p)
    end
end

LoadError: BoundsError: attempt to access 10×10 Matrix{Float64} at index [78, Int64[]]