In [None]:
# Working directory for this analysis: "<home>/<date>-<task>".
TODAY = "2021-11-13"
TASK = "phylogenetic-determination"
DIR = "$(homedir())/$(TODAY)-$(TASK)"

# Create the directory on first use, then make it the current directory so that
# relative paths resolve inside it.
isdir(DIR) || mkdir(DIR)
cd(DIR)

In [None]:
import Pkg

# Packages used by this notebook. Each one is brought in with `import`, so its names
# are always referenced in qualified form (e.g. `DataFrames.DataFrame`).
pkgs = [
    "JSON",
    "HTTP",
    "Dates",
    "uCSV",
    "DelimitedFiles",
    "DataFrames",
    "ProgressMeter",
    "BioSequences",
    "FASTX",
    "Distances",
    "Plots",
    "StatsPlots",
    "StatsBase",
    "Statistics",
    "Mmap",
    "MultivariateStats",
    "PyCall",
    "Random",
    "Primes",
    "Revise",
    "SparseArrays",
    "SHA",
    "Mycelia",
    "GenomicAnnotations",
    "BioFetch",
    "Combinatorics",
    "StaticArrays",
    "BioSymbols",
    "RollingFunctions",
    "OrderedCollections",
]

for pkg in pkgs
    import_statement = Meta.parse("import $pkg")
    try
        eval(import_statement)
    catch
        # Any failure of the first import is handled by adding the package and
        # importing again; a second failure propagates to the caller.
        Pkg.add(pkg)
        eval(import_statement)
    end
end

In [None]:
"""
    generate_all_possible_kmers(k, alphabet)

Return a sorted vector containing every length-`k` sequence that can be formed from
the symbols in `alphabet`.

# Arguments
- `k`: length of each generated sequence.
- `alphabet`: collection of symbols whose `eltype` is `BioSymbols.AminoAcid` or
  `BioSymbols.DNA`.

# Returns
A sorted vector of `BioSequences.LongAminoAcidSeq` (amino-acid alphabet) or
`BioSequences.LongDNASeq` (DNA alphabet).

# Throws
- `ErrorException` when the element type of `alphabet` is not one of the two supported
  symbol types.
"""
function generate_all_possible_kmers(k, alphabet)
    # Pick the sequence type first so an unsupported alphabet fails before the
    # length(alphabet)^k combinations are enumerated.
    symbol_type = eltype(alphabet)
    if symbol_type == BioSymbols.AminoAcid
        sequence_type = BioSequences.LongAminoAcidSeq
    elseif symbol_type == BioSymbols.DNA
        sequence_type = BioSequences.LongDNASeq
    else
        error(
            "unsupported alphabet element type $(symbol_type); " *
            "expected BioSymbols.AminoAcid or BioSymbols.DNA",
        )
    end

    # Cartesian product of k copies of the alphabet; each element is a k-tuple of symbols.
    kmer_iterator = Iterators.product([alphabet for i in 1:k]...)
    kmer_vectors = collect.(vec(collect(kmer_iterator)))
    kmers = sequence_type.(kmer_vectors)
    return sort!(kmers)
end

"""
    generate_all_possible_canonical_kmers(k, alphabet)

Return every canonical length-`k` k-mer over `alphabet`.

For an amino-acid alphabet the result of `generate_all_possible_kmers` is returned
unchanged. For a DNA alphabet each k-mer is replaced by `BioSequences.canonical` of
itself, duplicates are removed, and the survivors are converted to `BioSequences.DNAMer`.
Any other element type raises an error.
"""
function generate_all_possible_canonical_kmers(k, alphabet)
    all_kmers = generate_all_possible_kmers(k, alphabet)
    symbol_type = eltype(alphabet)

    # Amino-acid k-mers are passed through as they are.
    symbol_type == BioSymbols.AminoAcid && return all_kmers
    symbol_type == BioSymbols.DNA || error()

    canonical_forms = unique!(BioSequences.canonical.(all_kmers))
    return BioSequences.DNAMer.(canonical_forms)
end

In [None]:
"""
    count_canonical_aamers(k, fasta_proteins)

Count every length-`k` window across all protein records in `fasta_proteins`.

# Arguments
- `k`: window (k-mer) length.
- `fasta_proteins`: iterable of records accepted by `FASTX.sequence`.

# Returns
An `OrderedCollections.OrderedDict` mapping each observed k-mer to its total count over
all records, sorted by k-mer.
"""
function count_canonical_aamers(k, fasta_proteins)
    aamer_counts = OrderedCollections.OrderedDict{BioSequences.LongAminoAcidSeq, Int64}()
    for protein in fasta_proteins
        s = FASTX.sequence(protein)
        # A sequence of length L has L - k + 1 windows of length k. The range is empty
        # when L < k, so short sequences contribute nothing.
        last_start = length(s) - k + 1
        these_counts = StatsBase.countmap([s[i:i+k-1] for i in 1:last_start])
        # No per-protein sort is needed: the merged result is sorted once on return.
        merge!(+, aamer_counts, these_counts)
    end
    return sort(aamer_counts)
end

In [None]:
"""
    update_counts_matrix!(matrix, sample_index, countmap, sorted_kmers)

Fill column `sample_index` of `matrix` in place: row `i` receives the count stored in
`countmap` for the `i`-th element of `sorted_kmers`, or `0` when that k-mer has no entry.
Returns `matrix`.
"""
function update_counts_matrix!(matrix, sample_index, countmap, sorted_kmers)
    row = 0
    for kmer in sorted_kmers
        row += 1
        matrix[row, sample_index] = get(countmap, kmer, 0)
    end
    return matrix
end

In [None]:
"""
    accession_list_to_aamer_counts_table(accession_list, k, AA_ALPHABET)

Build a matrix of amino-acid k-mer counts with one row per k-mer over `AA_ALPHABET` and
one column per accession in `accession_list`.

For each accession the GenBank record is fetched with `BioFetch.fetchseq`, every CDS
feature is translated to protein, and the length-`k` windows of those proteins are
counted.

# Arguments
- `accession_list`: accessions to fetch, in column order.
- `k`: amino-acid k-mer length, used both for the row set and for counting.
- `AA_ALPHABET`: amino-acid symbols from which the row k-mers are generated.

# Returns
A `length(kmers) × length(accession_list)` matrix created with `zeros` and filled with
counts.
"""
function accession_list_to_aamer_counts_table(accession_list, k, AA_ALPHABET)
    canonical_aamers = generate_all_possible_canonical_kmers(k, AA_ALPHABET)
    aamer_counts_matrix = zeros(length(canonical_aamers), length(accession_list))

    ProgressMeter.@showprogress for (entity_index, accession) in enumerate(accession_list)
        entity_genbank = BioFetch.fetchseq(accession, format = BioFetch.gb)
        fasta_proteins = Vector{FASTX.FASTA.Record}()
        for gene in GenomicAnnotations.@genes(entity_genbank, CDS)
            try
                dna_seq = GenomicAnnotations.sequence(gene)
                # if seq isn't divisible by 3, cut final bases that can't be mapped to a codon
                translateable_length = div(length(dna_seq), 3) * 3
                truncated_dna_seq = dna_seq[1:translateable_length]
                aa_seq = BioSequences.translate(truncated_dna_seq)
                id = getproperty(gene, :protein_id)
                product = getproperty(gene, :product)
                record = FASTX.FASTA.Record(id, product, aa_seq)
                push!(fasta_proteins, record)
            catch err
                # Best effort: a CDS that cannot be turned into a protein record is
                # skipped, but the reason stays visible at debug log level.
                @debug "skipping CDS that could not be converted to a protein record" accession exception = err
                continue
            end
        end
        # Count with the `k` argument so the counted k-mers match the matrix rows.
        entity_aamer_counts = count_canonical_aamers(k, fasta_proteins)
        update_counts_matrix!(aamer_counts_matrix, entity_index, entity_aamer_counts, canonical_aamers)
    end
    return aamer_counts_matrix
end

In [None]:
"""
    accession_list_to_dnamer_counts_table(accession_list, k)

Build a matrix of canonical DNA k-mer counts with one row per canonical k-mer over the
global `DNA_ALPHABET` and one column per accession in `accession_list`.

For each accession the GenBank record is fetched with `BioFetch.fetchseq`, its
chromosomes are wrapped as FASTA records, and canonical k-mers are counted with
`Mycelia.count_canonical_kmers`.

# Arguments
- `accession_list`: accessions to fetch, in column order.
- `k`: DNA k-mer length, used both for the row set and for counting.

# Returns
A `length(kmers) × length(accession_list)` matrix created with `zeros` and filled with
counts.
"""
function accession_list_to_dnamer_counts_table(accession_list, k)
    canonical_dnamers = generate_all_possible_canonical_kmers(k, DNA_ALPHABET)
    dnamer_counts_matrix = zeros(length(canonical_dnamers), length(accession_list))

    ProgressMeter.@showprogress for (entity_index, accession) in enumerate(accession_list)
        entity_genbank = BioFetch.fetchseq(accession, format = BioFetch.gb)
        fasta_dna_sequences = [
            FASTX.FASTA.Record(chromosome.name, chromosome.sequence)
            for chromosome in entity_genbank
        ]
        # Count with the `k` argument so the counted k-mers match the matrix rows.
        entity_dnamer_counts = Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, fasta_dna_sequences)
        update_counts_matrix!(dnamer_counts_matrix, entity_index, entity_dnamer_counts, canonical_dnamers)
    end
    return dnamer_counts_matrix
end

In [None]:
"""
    normalize_distance_matrix(distance_matrix)

Divide every entry of `distance_matrix` by its largest usable entry and return the
result as a new array.

Entries that are `missing`, `nothing` or `NaN` are ignored when searching for the
maximum. They are not removed from the returned array.

# Throws
- An error from `maximum` when no usable entry exists.
"""
function normalize_distance_matrix(distance_matrix)
    # `isnan` does not return a Bool for `missing` and has no method for `nothing`,
    # so those two cases must be ruled out before it is called.
    is_usable(x) = !ismissing(x) && !isnothing(x) && !isnan(x)
    max_non_nan_value = maximum(filter(is_usable, vec(distance_matrix)))
    return distance_matrix ./ max_non_nan_value
end

In [None]:
"""
    count_matrix_to_probability_matrix(counts_matrix)

Return a new floating-point matrix in which every column of `counts_matrix` has been
divided by that column's sum. `counts_matrix` itself is not modified.

Integer count matrices are accepted; the result is always floating point. A column
whose sum is zero yields `NaN` entries (0/0).
"""
function count_matrix_to_probability_matrix(counts_matrix)
    # Broadcasting `float` always allocates a fresh floating-point matrix, so the input
    # is left untouched and fractional values can be stored even for integer counts.
    probability_matrix = float.(counts_matrix)
    for col in eachcol(probability_matrix)
        col ./= sum(col)
    end
    return probability_matrix
end

In [None]:
# Mycelia's metadata directory: "metadata" under the directory two levels above the
# file returned by `pathof(Mycelia)`.
MYCELIA_METADATA = joinpath(pathof(Mycelia) |> dirname |> dirname, "metadata")

In [None]:
# Amino-acid alphabet with the terminator symbol (BioSequences.AA_Term) removed.
AA_ALPHABET = collect(filter(!=(BioSequences.AA_Term), Mycelia.AA_ALPHABET))
# DNA alphabet, taken from Mycelia unchanged.
DNA_ALPHABET = Mycelia.DNA_ALPHABET

- Jaccard doesn't seem to differentiate sufficiently between sequences without resorting to really large k sizes
- Euclidean consistently separates within vs between @ 10 samples of family
- JS and Cosine have some entities that go the wrong way, i.e. their distance between families is less than their distance within the family

In [None]:
# Number of entities to sample. Smaller runs used 10 and 100.
n_samples = 1_000

In [None]:
# Metadata table source:
# https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Bacteriophage,%20all%20taxids&Completeness_s=complete
entity_metadata = DataFrames.DataFrame(
    uCSV.read(
        "$(MYCELIA_METADATA)/2021-11-13-ncbi-complete-bacteriophage.csv",
        header = 1,
        quotes = '"'
    )...
)

# Keep only complete RefSeq entries.
entity_metadata = filter("Nuc_Completeness" => ==("complete"), entity_metadata)
entity_metadata = filter("Sequence_Type" => ==("RefSeq"), entity_metadata)

# Sort by accession and seed the global RNG so that the same rows are sampled on
# every run.
sort!(entity_metadata, "Accession")
Random.seed!(0)
subset_indices = StatsBase.sample(
    1:DataFrames.nrow(entity_metadata), n_samples, replace = false, ordered = true
)
entity_metadata = entity_metadata[subset_indices, :]

In [None]:
# Accession identifiers of the sampled entities, in table row order.
accession_list = entity_metadata[!, "Accession"]

In [None]:
# k-mer sizes for the DNA and amino-acid grammars.
# dna_k = 5 with aa_k = 2 was too small: all of the within vs between comparisons
# showed some disagreement.
dna_k, aa_k = 7, 3

In [None]:
# http://leb.snu.ac.kr/ezaai

In [None]:
# rep_origin      complement(16763)
# variation       complement(2665)
#      misc_feature    complement(3871)

#      variation       complement(3939)
#      variation       complement(20826)
#      modified_base   complement(29281)
#      modified_base   complement(29282)
#      modified_base   complement(29295)
#      modified_base   complement(31645)
#      modified_base   complement(31646)
#      modified_base   complement(31669)
#      modified_base   complement(31671)
#      modified_base   complement(31689)
#      modified_base   complement(31691)
#      modified_base   complement(31692)
#      modified_base   complement(32533)
#      modified_base   complement(32534)
#      modified_base   complement(32542)
#      modified_base   complement(32551)
#      modified_base   complement(32568)
#      modified_base   complement(33589)
#      modified_base   complement(33599)
#      modified_base   complement(33600)
#      modified_base   complement(33973)
#      modified_base   complement(33974)
#      modified_base   complement(33982)
#      modified_base   complement(33988)
#      modified_base   complement(33991)
#      modified_base   complement(34009)
#      variation       complement(49332)
#      variation       complement(50257)
#      variation       complement(51626)
#      variation       complement(92822)
#      variation       complement(101898)
#      variation       complement(102426)
#      variation       complement(105008)
#      variation       complement(107430)


In [None]:
# Amino-acid k-mer counts (k = aa_k): one row per k-mer, one column per accession.
aamer_counts_matrix = accession_list_to_aamer_counts_table(accession_list, aa_k, AA_ALPHABET)

In [None]:
# Canonical DNA k-mer counts (k = dna_k): one row per k-mer, one column per accession.
dnamer_counts_matrix = accession_list_to_dnamer_counts_table(accession_list, dna_k)

In [None]:
# Sorted, de-duplicated taxon names at each rank, with empty names dropped.
unique_species = sort!(filter!(!isempty, unique(entity_metadata[!, "Species"])))
unique_genera = sort!(filter!(!isempty, unique(entity_metadata[!, "Genus"])))
unique_families = sort!(filter!(!isempty, unique(entity_metadata[!, "Family"])))

In [None]:
# Column-normalized versions of the two counts matrices.
# NOTE(review): `dnamer_probility_matrix` is misspelled ("probility"); the name is kept
# because a later cell refers to it with this spelling.
aamer_probability_matrix = count_matrix_to_probability_matrix(aamer_counts_matrix)
dnamer_probility_matrix = count_matrix_to_probability_matrix(dnamer_counts_matrix)

In [None]:
# One entry per (metric, grammar) pair: (normalized pairwise distance matrix, metric
# name, grammar). Entries are ordered metric by metric, with the "AA" entry before the
# "DNA" entry. Distances are taken between columns (dims=2). The trailing notes record
# how each metric performed, written as AA / DNA.
matrix_metric_grammar_groups = [
    (normalize_distance_matrix(Distances.pairwise(metric, matrix, dims=2)), metric_name, grammar)
    for (metric, metric_name, aa_matrix, dna_matrix) in (
        # metrics computed on the raw counts
        (Distances.euclidean, "euclidean", aamer_counts_matrix, dnamer_counts_matrix), # good / good
        (Distances.cityblock, "cityblock", aamer_counts_matrix, dnamer_counts_matrix), # redundant with above
        (Distances.corr_dist, "corr_dist", aamer_counts_matrix, dnamer_counts_matrix), # meh / meh
        (Distances.cosine_dist, "cosine_dist", aamer_counts_matrix, dnamer_counts_matrix), # meh / very bad
        # metrics computed on the column-normalized probabilities
        (Distances.totalvariation, "totalvariation", aamer_probability_matrix, dnamer_probility_matrix), # good / bad
        (Distances.js_divergence, "js_divergence", aamer_probability_matrix, dnamer_probility_matrix), # good / bad
        (Distances.bhattacharyya, "bhattacharyya", aamer_probability_matrix, dnamer_probility_matrix), # good / bad
        (Distances.hellinger, "hellinger", aamer_probability_matrix, dnamer_probility_matrix), # good / bad
    ) for (matrix, grammar) in ((aa_matrix, "AA"), (dna_matrix, "DNA"))
]

In [None]:
# For each (distance matrix, metric, grammar) group, compare every entity's mean
# distance to the other members of its own family with its mean distance to all
# entities outside that family, then plot one line per entity and save it as PNG and SVG.
for (distance_matrix, distance_metric, grammar) in matrix_metric_grammar_groups

    n = size(distance_matrix, 1)
    all_indices = 1:1:n

    within_vs_between_distances = []
    ProgressMeter.@showprogress for family in unique_families
        family_indices = findall(entity_metadata[!, "Family"] .== family)
        other_family_indices = setdiff(all_indices, family_indices)
        for index in family_indices
            # Members of the same family, excluding the entity itself.
            other_indices = filter(!=(index), family_indices)
            avg_within_family_distance = Statistics.mean(distance_matrix[index, other_indices])
            avg_between_family_distance = Statistics.mean(distance_matrix[index, other_family_indices])
            push!(within_vs_between_distances, avg_within_family_distance => avg_between_family_distance)
        end
    end
    # Drop pairs where either mean is NaN (e.g. an entity that is the only member of
    # its family has nothing to average within the family).
    within_vs_between_distances = filter(
        d -> !(isnan(d.first) || isnan(d.second)),
        within_vs_between_distances
    )

    # One two-point line per entity: x = 1 (within) to x = 2 (between).
    ys = collect.(within_vs_between_distances)
    xs = [[1, 2] for _ in ys]

    if grammar == "AA"
        k = aa_k
    elseif grammar == "DNA"
        k = dna_k
    end

    p = StatsPlots.plot(
        xs,
        ys,
        xticks = ([1, 2], ["within family", "between families"]),
        legend = false,
        xlims = (0.75, 2.25),
        alpha = 0.5,
        title = "$(distance_metric) distance @ k=$k & $grammar\n(n=$(n))",
        ylabel = "normalized distance",
        marker = :circle
    )
    StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-family-k$k-$(grammar)-n$n.png")
    StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-family-k$k-$(grammar)-n$n.svg")
    display(p)
end

In [None]:
# For each (distance matrix, metric, grammar) group, compare every entity's mean
# distance to the other members of its own genus with its mean distance to all
# entities outside that genus, then plot one line per entity and save it as PNG and SVG.
for (distance_matrix, distance_metric, grammar) in matrix_metric_grammar_groups

    n = size(distance_matrix, 1)
    all_indices = 1:1:n

    within_vs_between_distances = []
    ProgressMeter.@showprogress for genus in unique_genera
        within_indices = findall(entity_metadata[!, "Genus"] .== genus)
        between_indices = setdiff(all_indices, within_indices)
        for index in within_indices
            # Members of the same genus, excluding the entity itself.
            other_within_indices = filter(!=(index), within_indices)
            avg_within_distance = Statistics.mean(distance_matrix[index, other_within_indices])
            avg_between_distance = Statistics.mean(distance_matrix[index, between_indices])
            push!(within_vs_between_distances, avg_within_distance => avg_between_distance)
        end
    end
    # Drop pairs where either mean is NaN (e.g. an entity that is the only member of
    # its genus has nothing to average within the genus).
    within_vs_between_distances = filter(
        d -> !(isnan(d.first) || isnan(d.second)),
        within_vs_between_distances
    )

    # One two-point line per entity: x = 1 (within) to x = 2 (between).
    ys = collect.(within_vs_between_distances)
    xs = [[1, 2] for _ in ys]

    if grammar == "AA"
        k = aa_k
    elseif grammar == "DNA"
        k = dna_k
    end

    p = StatsPlots.plot(
        xs,
        ys,
        xticks = ([1, 2], ["within genus", "between genera"]),
        legend = false,
        xlims = (0.75, 2.25),
        alpha = 0.5,
        title = "$(distance_metric) distance @ k=$k & $grammar\n(n=$(n))",
        ylabel = "normalized distance",
        marker = :circle
    )
    StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-genus-k$k-$(grammar)-n$n.png")
    StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-genus-k$k-$(grammar)-n$n.svg")
    display(p)
end