In [2]:
# Per-analysis working directory: <home>/<date>-<task>.
TODAY = "2021-11-13"
TASK = "phylogenetic-determination"
DIR = "$(homedir())/$(TODAY)-$(TASK)"
# mkpath does nothing when the directory already exists, so no isdir check is needed.
mkpath(DIR)
# Everything below runs with DIR as the current directory.
cd(DIR)

In [3]:
import Pkg

pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"Plots",
"StatsPlots",
"StatsBase",
"Statistics",
"Mmap",
"MultivariateStats",
"PyCall",
"Random",
"Primes",
"Revise",
"SparseArrays",
"SHA",
"Mycelia",
"GenomicAnnotations",
"BioFetch",
"Combinatorics",
"StaticArrays",
"BioSymbols",
"RollingFunctions",
"OrderedCollections"
]

# Import each package named in `pkgs`; when the import throws, add the package with Pkg
# and retry the import once.
foreach(pkgs) do pkg
    import_statement = "import $pkg"
    try
        eval(Meta.parse(import_statement))
    catch
        Pkg.add(pkg)
        eval(Meta.parse(import_statement))
    end
end

In [27]:
"""
    generate_all_possible_kmers(k, alphabet)

Return a sorted vector containing every length-`k` sequence over `alphabet`.

The k-mers are enumerated as the Cartesian product of `k` copies of `alphabet`.

# Arguments
- `k`: k-mer length.
- `alphabet`: collection of symbols; `eltype(alphabet)` must be `BioSymbols.AminoAcid`
  or `BioSymbols.DNA`.

# Returns
A sorted vector of `BioSequences.LongAminoAcidSeq` (amino-acid alphabet) or
`BioSequences.LongDNASeq` (DNA alphabet).

# Throws
- `ErrorException` when the alphabet element type is neither of the two supported types.
"""
function generate_all_possible_kmers(k, alphabet)
    symbol_type = eltype(alphabet)
    # Choose the sequence constructor first so an unsupported alphabet fails before the
    # full product is enumerated.
    if symbol_type == BioSymbols.AminoAcid
        sequence_type = BioSequences.LongAminoAcidSeq
    elseif symbol_type == BioSymbols.DNA
        sequence_type = BioSequences.LongDNASeq
    else
        error(
            "unsupported alphabet element type $(symbol_type); " *
            "expected BioSymbols.AminoAcid or BioSymbols.DNA",
        )
    end
    kmer_iterator = Iterators.product([alphabet for i in 1:k]...)
    # Each product element is a tuple of symbols; collect it into a vector of symbols.
    kmer_vectors = collect.(vec(collect(kmer_iterator)))
    return sort!(sequence_type.(kmer_vectors))
end

"""
    generate_all_possible_canonical_kmers(k, alphabet)

Return every canonical length-`k` k-mer over `alphabet`.

For an amino-acid alphabet the output of `generate_all_possible_kmers` is returned
unchanged. For a DNA alphabet each k-mer is mapped through `BioSequences.canonical`,
duplicates are removed, and the result is converted to `BioSequences.DNAMer`.

# Throws
- `ErrorException` when `eltype(alphabet)` is neither `BioSymbols.AminoAcid` nor
  `BioSymbols.DNA`.
"""
function generate_all_possible_canonical_kmers(k, alphabet)
    kmers = generate_all_possible_kmers(k, alphabet)
    symbol_type = eltype(alphabet)
    if symbol_type == BioSymbols.AminoAcid
        return kmers
    elseif symbol_type == BioSymbols.DNA
        return BioSequences.DNAMer.(unique!(BioSequences.canonical.(kmers)))
    else
        error(
            "unsupported alphabet element type $(symbol_type); " *
            "expected BioSymbols.AminoAcid or BioSymbols.DNA",
        )
    end
end

generate_all_possible_canonical_kmers (generic function with 1 method)

In [35]:
"""
    count_canonical_aamers(k, fasta_proteins)

Count every length-`k` window across the sequences of `fasta_proteins`.

# Arguments
- `k`: window (k-mer) length.
- `fasta_proteins`: iterable of records whose sequence is read with `FASTX.sequence`.

# Returns
A key-sorted `OrderedCollections.OrderedDict` mapping each observed k-mer to its total
count over all records.
"""
function count_canonical_aamers(k, fasta_proteins)
    aamer_counts = OrderedCollections.OrderedDict{BioSequences.LongAminoAcidSeq, Int64}()
    for protein in fasta_proteins
        s = FASTX.sequence(protein)
        # A sequence of length n has n - k + 1 windows of length k (none when n < k).
        # The previous bound, length(s) - 1, was only correct for k == 2.
        last_start = length(s) - k + 1
        these_counts = StatsBase.countmap([s[i:i+k-1] for i in 1:last_start])
        merge!(+, aamer_counts, these_counts)
    end
    # A single sort at the end fixes the key order; per-record sorting is not needed.
    return sort(aamer_counts)
end

count_canonical_aamers (generic function with 1 method)

In [24]:
"""
    update_counts_matrix!(matrix, sample_index, countmap, sorted_kmers)

Fill column `sample_index` of `matrix` in place: row `i` receives the count stored in
`countmap` for the `i`-th element of `sorted_kmers`, or `0` when that k-mer is absent.
Returns `matrix`.
"""
function update_counts_matrix!(matrix, sample_index, countmap, sorted_kmers)
    row = 0
    for kmer in sorted_kmers
        row += 1
        matrix[row, sample_index] = get(countmap, kmer, 0)
    end
    return matrix
end

update_counts_matrix! (generic function with 1 method)

In [62]:
"""
    accession_list_to_aamer_counts_table(accession_list, k, AA_ALPHABET)

Build a matrix of amino-acid `k`-mer counts for a list of accessions.

Rows follow the order of `generate_all_possible_canonical_kmers(k, AA_ALPHABET)`;
column `j` belongs to `accession_list[j]`.

For each accession the GenBank record is fetched with `BioFetch.fetchseq`, every CDS
feature is translated with `BioSequences.translate`, and the translated proteins are
counted with `count_canonical_aamers`.

# Arguments
- `accession_list`: iterable of accessions accepted by `BioFetch.fetchseq`.
- `k`: amino-acid k-mer length.
- `AA_ALPHABET`: amino-acid alphabet used to enumerate the rows.

# Returns
A `Float64` matrix of size `(number of k-mers, length(accession_list))`.

# Throws
- `ErrorException` when a CDS feature cannot be turned into a protein record.
"""
function accession_list_to_aamer_counts_table(accession_list, k, AA_ALPHABET)
    canonical_aamers = generate_all_possible_canonical_kmers(k, AA_ALPHABET)
    aamer_counts_matrix = zeros(length(canonical_aamers), length(accession_list))

    ProgressMeter.@showprogress for (entity_index, accession) in enumerate(accession_list)
        entity_genbank = BioFetch.fetchseq(accession, format = BioFetch.gb)
        fasta_proteins = Vector{FASTX.FASTA.Record}()
        for gene in GenomicAnnotations.@genes(entity_genbank, CDS)
            try
                dna_seq = GenomicAnnotations.sequence(gene)
                # if seq isn't divisible by 3, cut final bases that can't be mapped to a codon
                translateable_length = div(length(dna_seq), 3) * 3
                truncated_dna_seq = dna_seq[1:translateable_length]
                aa_seq = BioSequences.translate(truncated_dna_seq)
                id = getproperty(gene, :protein_id)
                product = getproperty(gene, :product)
                record = FASTX.FASTA.Record(id, product, aa_seq)
                push!(fasta_proteins, record)
            catch
                # Log the offending sequence, then abort with a message naming the accession.
                @error GenomicAnnotations.sequence(gene)
                error("could not build a protein record for a CDS of accession $(accession)")
            end
        end
        # Count with the `k` argument (not a notebook global) so the counted k-mers have
        # the same length as the rows in `canonical_aamers`.
        entity_aamer_counts = count_canonical_aamers(k, fasta_proteins)
        update_counts_matrix!(aamer_counts_matrix, entity_index, entity_aamer_counts, canonical_aamers)
    end
    return aamer_counts_matrix
end

accession_list_to_aamer_counts_table (generic function with 2 methods)

In [65]:
"""
    accession_list_to_dnamer_counts_table(accession_list, k)

Build a matrix of canonical DNA `k`-mer counts for a list of accessions.

Rows follow the order of `generate_all_possible_canonical_kmers(k, DNA_ALPHABET)`, where
`DNA_ALPHABET` is read from the enclosing (notebook) scope; column `j` belongs to
`accession_list[j]`.

For each accession the GenBank record is fetched with `BioFetch.fetchseq`, each entry of
the record is wrapped in a FASTA record, and k-mers are counted with
`Mycelia.count_canonical_kmers`.

# Returns
A `Float64` matrix of size `(number of canonical k-mers, length(accession_list))`.
"""
function accession_list_to_dnamer_counts_table(accession_list, k)
    canonical_dnamers = generate_all_possible_canonical_kmers(k, DNA_ALPHABET)
    dnamer_counts_matrix = zeros(length(canonical_dnamers), length(accession_list))

    ProgressMeter.@showprogress for (entity_index, accession) in enumerate(accession_list)
        entity_genbank = BioFetch.fetchseq(accession, format = BioFetch.gb)
        fasta_dna_sequences = [
            FASTX.FASTA.Record(chromosome.name, chromosome.sequence)
            for chromosome in entity_genbank
        ]
        # Use the `k` argument (not a notebook global) so the counted k-mer type matches
        # the rows in `canonical_dnamers`.
        entity_dnamer_counts = Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, fasta_dna_sequences)
        update_counts_matrix!(dnamer_counts_matrix, entity_index, entity_dnamer_counts, canonical_dnamers)
    end
    return dnamer_counts_matrix
end

accession_list_to_dnamer_counts_table (generic function with 2 methods)

In [89]:
"""
    normalize_distance_matrix(distance_matrix)

Return a new array in which every entry of `distance_matrix` is divided by the largest
entry of `distance_matrix`.
"""
function normalize_distance_matrix(distance_matrix)
    largest_distance = maximum(distance_matrix)
    return distance_matrix ./ largest_distance
end

normalize_distance_matrix (generic function with 1 method)

In [74]:
"""
    counts_matrix_to_euclidean_distance_matrix(counts_matrix)

Return the symmetric matrix of pairwise Euclidean distances between the columns of
`counts_matrix`, computed with `Distances.euclidean`.

The result has size `(n, n)` with `n = size(counts_matrix, 2)`; the diagonal is `0.0`.
"""
function counts_matrix_to_euclidean_distance_matrix(counts_matrix)
    n_samples = size(counts_matrix, 2)
    distance_matrix = zeros(n_samples, n_samples)
    for x1 in 1:n_samples
        x1_values = counts_matrix[:, x1]
        # Visit each unordered pair once; both mirrored entries are written together and
        # the diagonal keeps its initial 0.0.
        for x2 in (x1 + 1):n_samples
            x2_values = counts_matrix[:, x2]
            d = Distances.euclidean(x1_values, x2_values)
            distance_matrix[x1, x2] = distance_matrix[x2, x1] = d
        end
    end
    return distance_matrix
end

counts_matrix_to_euclidean_distance_matrix (generic function with 1 method)

In [75]:
"""
    counts_matrix_to_js_divergence_matrix(counts_matrix)

Return the symmetric matrix of pairwise `Distances.js_divergence` values between the
columns of `counts_matrix`, after dividing each column by its own sum.

The result has size `(n, n)` with `n = size(counts_matrix, 2)`; the diagonal is `0.0`.
"""
function counts_matrix_to_js_divergence_matrix(counts_matrix)
    n_samples = size(counts_matrix, 2)
    distance_matrix = zeros(n_samples, n_samples)
    # Normalize every column once up front instead of once per pair.
    probabilities = [
        counts_matrix[:, x] ./ sum(counts_matrix[:, x]) for x in 1:n_samples
    ]
    for x1 in 1:n_samples
        # Visit each unordered pair once; both mirrored entries are written together and
        # the diagonal keeps its initial 0.0.
        for x2 in (x1 + 1):n_samples
            d = Distances.js_divergence(probabilities[x1], probabilities[x2])
            distance_matrix[x1, x2] = distance_matrix[x2, x1] = d
        end
    end
    return distance_matrix
end

counts_matrix_to_js_divergence_matrix (generic function with 1 method)

In [77]:
"""
    counts_matrix_to_jaccard_distance_matrix(counts_matrix)

Return the symmetric matrix of pairwise `Distances.jaccard` values between the columns of
`counts_matrix`, after reducing each column to presence/absence (`count .> 0`).

The result has size `(n, n)` with `n = size(counts_matrix, 2)`; the diagonal is `0.0`.
"""
function counts_matrix_to_jaccard_distance_matrix(counts_matrix)
    n_samples = size(counts_matrix, 2)
    distance_matrix = zeros(n_samples, n_samples)
    # Compute every presence/absence mask once up front instead of once per pair.
    presence = [counts_matrix[:, x] .> 0 for x in 1:n_samples]
    for x1 in 1:n_samples
        # Visit each unordered pair once; both mirrored entries are written together and
        # the diagonal keeps its initial 0.0.
        for x2 in (x1 + 1):n_samples
            d = Distances.jaccard(presence[x1], presence[x2])
            distance_matrix[x1, x2] = distance_matrix[x2, x1] = d
        end
    end
    return distance_matrix
end

counts_matrix_to_jaccard_distance_matrix (generic function with 1 method)

In [6]:
# MYCELIA_METADATA = joinpath(Pkg.dir("Mycelia"), "metadata")
# Path to the `metadata` directory two levels above the file reported by pathof(Mycelia).
MYCELIA_METADATA = joinpath(dirname(dirname(pathof(Mycelia))),  "metadata")

"/home/jupyter-cjprybol/.julia/dev/Mycelia/metadata"

In [7]:
# Source of the CSV export read below:
# https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Bacteriophage,%20all%20taxids&Completeness_s=complete
entity_metadata = DataFrames.DataFrame(
    uCSV.read(
        "$(MYCELIA_METADATA)/2021-11-13-ncbi-complete-bacteriophage.csv",
        header = 1,
        quotes = '"'
    )...
)
# Keep only the rows that are both complete and RefSeq, then order them by accession.
entity_metadata = entity_metadata[
    (entity_metadata[!, "Nuc_Completeness"] .== "complete") .&
    (entity_metadata[!, "Sequence_Type"] .== "RefSeq"),
    :
]
sort!(entity_metadata, "Accession")

Unnamed: 0_level_0,Accession,SRA_Accession,Submitters
Unnamed: 0_level_1,String,String,String
1,NC_000866.4,,"Miller,E.S., Kutter,E., Mosig,G., Arisaka,F., Kunisawa,T., Ruger,W., Ho,C.K., Shuman,S., Shcherbakov,V., Granovsky,I., Plugina,L., Shcherbakova,T., Sizova,S., Pyatkov,K., Shlyapnikov,M., Shubina,O., Xu,W., Gauss,P., Shen,J., Dunn,C.A., Bessman,M.J., Truncaite,L., Zajanckauskaite,A., Nivinskas,R., Belle,A., Landthaler,M., Shub,D.A., Stehr,M., Schneider,G., Aslund,F., Holmgren,A., Lindqvist,Y., Ramanculov,E., Young,R., Vaiskunaite,R., Miller,A., Davenport,L., Colowick,N.E., Pietz,B.C., Franklin,J.L., Haseltine,D., Wang,F.J., Ripley,L.S., Paddison,P., Abedon,S.T., Dressman,H.K., Gailbreath,K., Tracy,J., Mosser,E., Neitzel,J., Guttman,B., Kim,B.C., Kim,K., Park,E.H., Lim,C.J., Kadyrov,F.A., Shlyapnikov,M.G., Kryukov,V.M., Malys,N., Carles-Kinch,K., George,J.W., Kreuzer,K.N., Kai,T., Selick,H.E., Yonesaki,T., Penner,M., Morad,I., Snyder,L., Kaufmann,G., Bova,R., Cascino,A., Cipollaro,M., Gargano,S., Grau,O., Micheli,M.R., Santoro,M., Scarlato,V., Storlazzi,A., Young,P., Ohman,M., Sjoberg,B.M., Koch,T., Bouet,J.Y., Woszczyk,J., Repoila,F., Francois,V., Louarn,J.M., Krisch,H.M., Stormo,G.D., Dyson,R.L., Alberts,B.M., Orsini,G., Ouhammouch,M., Le Caer,J.P., Brody,E.N., Hacker,K.J., Sanson,B., Uzan,M., Sharma,M., Ellis,R.L., Hinton,D.M., Raudonikiene,A., Viteniene,I., Gruidl,M.E., Chen,T.C., Efimov,V.P., Prilipov,A.G., Mesyanzhinov,V.V., Brody,E., Favre,R., Kaliman,A.V., Khasanova,M.A., Tanyashin,V.I., Bayev,A.A., Powell,D., Franklin,J., Daegelen,P., Frazier,M.W., Maley,G.F., Duceman,B.W., Wang,A.M., Martinez,J., Maley,F., Marusich,E.I., Lin,G.W., Fan,W.H., Hahn,S., Hilse,D., Lu,M.J., Henning,U., Lamm,N., Selivanov,N.A., Black,L.W., Nikolaeva,L.I., Tseng,M.J., Hilfinger,J.M., Walsh,A., Greenberg,G.R., Spicer,E.K., Rush,J., Fung,C., Reha-Krantz,L.J., Karam,J.D., Konigsberg,W.H., Ishimoto,L.K., Ishimoto,K.S., Eiserling,F.A., Nakako,T., Takahashi,H., Ishii,S., Wang,Y., Mathews,C.K., Ishimoto,L., Kassavetis,G., Kumazaki,T., Huang,W.M., Ao,S.Z., Casjens,S., 
Orlandi,R., Zeikus,R., Weiss,R., Winge,D., Fang,M., Montag,D., Degen,M., Riede,I., Eschbach,M.L., Tomaschewski,J., Hsu,T., Wei,R.X., Dawson,M., Gayle,M., Winter,R.B., Gold,L., Chu,F.K., Valerie,K., Stevens,J., Lynch,M., Henderson,E.E., de Riel,J.K., Hahne,S., Mathews,C.Z., Rand,K.N., Gait,M.J., West,D.K., Belfort,M., Keller,B., Bickle,T.A., Gram,H., Crabb,J.W., Fujisawa,H., Minagawa,T., Broida,J., Abelson,J., Midgley,C.A., Murray,N.E., Parker,M.L., Christensen,A.C., Boosman,A., Stockard,J., Young,E.T., Doermann,A.H., Macdonald,P.M., Sengstag,C., Kellenberger,E., Drivdahl,R., Rand,K., Trojanowska,M., Karam,J., Stormo,G., Purohit,S., Armstrong,J., Brown,R.S., Tsugita,A., Owen,J.E., Schultz,D.W., Taylor,A., Smith,G.R., Volker,T.A., Gafner,J., Showe,M.K., Allet,B., Oliver,D.B., Crowther,R.A., Pribnow,D., Sigurdson,D.C., Singer,B.S., Napoli,C., Brosius,J., Dull,T.J., Noller,H.F., Anderson,B., Zurabishvili,T., Marusich,E., Schneider,M., Mullins,T., Napuli,A., Mesyanzhinov,V., Drake,J.W., Nguyen,D., Dressman,H., Goldberg,E.B., Ueno,H., Otsuko,Y., Morimoto,W., Kreuzer,K., Stidham,T., Thomas,E., Mzhavia,N., Djavachishvili,T., Peterson,S., Eidemiller,J., Awaya,M., Canada,W., Dimitroff,B., Blattner,F., Nakanishi,M., Alberts,B., Yasuda,G., Parker,M., Doermann,G."
2,NC_000867.1,,"Kivela,H.M., Mannisto,R.H., Kalkkinen,N., Bamford,D.H., Paulin,L., Bamford,J.K., Bamford,J.K.H."
3,NC_000871.1,,"Desiere,F., Lucchini,S., Brussow,H."
4,NC_000872.1,,"Desiere,F., Lucchini,S., Brussow,H., Bruttin,A."
5,NC_000896.1,,"Altermann,E., Klein,J.R., Henrich,B., Engel,G., Klein,J., Binishofer,B., Blasi,U., Fremaux,C., De Antoni,G.L., Raya,R.R., Klaenhammer,T.R."
6,NC_000902.1,,"Miyamoto,H., Nakai,W., Yajima,N., Fujibayashi,A., Higuchi,T., Sato,K., Matsushiro,A."
7,NC_000924.1,,"Plunkett,G. III, Rose,D.J., Durfee,T.J., Blattner,F.R."
8,NC_000929.1,,"Morgan,G.J., Hatfull,G.F., Casjens,S., Hendrix,R.W., Morgan,G., Hatfull,G., Hendrix,R."
9,NC_000935.1,,"van der Wilk,F., Dullemans,A.M., Verbeek,M., van den Heuvel,J.F., van den Heuvel,J.F.J.M."
10,NC_001271.1,,"Pajunen,M.I., Kiljunen,S.J., Soderholm,M.E., Skurnik,M."


In [8]:
# Seed the global RNG before sampling.
Random.seed!(0)
n_samples = 10
# Draw n_samples row indices without replacement (replace=false).
# NOTE(review): ordered=true presumably keeps the drawn indices in their original order —
# confirm against the StatsBase.sample documentation.
subset_indices = StatsBase.sample(1:size(entity_metadata, 1), n_samples, ordered=true, replace=false)
# Replace the full table with the sampled rows.
entity_metadata = entity_metadata[subset_indices, :]
# n_samples = size(entity_metadata, 1)

Unnamed: 0_level_0,Accession,SRA_Accession,Submitters
Unnamed: 0_level_1,String,String,String
1,NC_005355.1,,"Ventura,M., Canchaya,C., Pridmore,R.D., Brussow,H., Canchaya,C.A., Pridmore,D., Bruessow,H."
2,NC_020844.1,,"Henn,M.R., Dillon,J., Levin,J., Malboeuf,C., Casali,M., Russ,C., Lennon,N., Chapman,S.B., Erlich,R., Young,S.K., Yandava,C., Zeng,Q., Alvarado,L., Anderson,S., Berlin,A., Chen,Z., Freedman,E., Gellesch,M., Goldberg,J., Green,L., Griggs,A., Gujja,S., Heilman,E.R., Heiman,D., Hollinger,A., Howarth,C., Larson,L., Mehta,T., Pearson,M., Roberts,A., Ryan,E., Saif,S., Shea,T., Shenoy,N., Sisk,P., Stolte,C., Sykes,S., White,J., Haas,B., Nusbaum,C., Birren,B."
3,NC_022057.1,,"Farina,J., Richards,K., Hiller,J., Gannon,D., Reifsnyder,R., Vogel,A., Ganski,A., Massaley,M., Carzo,S., Brower,C., Semler,R., Smith,V., Friel,S., Flynn,L., Moran,D.J., Wodarski,D.M., Harrison,M.A., Dunbar,D.A., Wang,X., Crowell,R., Bostrom,M.A., Burke,M., Wright,G.M., Gregory,S.G., Colman,S.D., Bradley,K.W., Khaja,R., Lewis,M.F., Barker,L.P., Asai,D.J., Bowman,C.A., Russell,D.A., Pope,W.H., Jacobs-Sera,D., Hendrix,R.W., Hatfull,G.F."
4,NC_027365.1,,"Hatfull,G.F., Jamborcic,A., Chen,D., Mazahreh,R., Pirahanchi,Y., Mcburney-Lin,J., Beckham,A., Rich,B., Cidambi,V., Brooks,G., Jones,S., Ricci-Tam,C., Evangelista,N., Shi,E., Matinrad,H., Coker,J., Hill,R., Lam,D., Zhang,J., Dimopoulos,C., Khan,S., Wong,A., Chou,J., Hwang,V., Chen,A., Zhang,C., Wu,K., Nguyen,E., Magarian,J., Nonejuie,P., Pogliano,K., Pogliano,J., Lee,V., Hendricks,S.L., Voegtly,L.J., Wang,Y., Glascock,A.L., Anderson,J., Williamson,S.M., Walstead,R.N., Carvalho,M.R.C., Johnson,A., Buck,G.A., Bradley,K.W., Khaja,R., Lewis,M.F., Barker,L.P., Jordan,T.C., Russell,D.A., Pope,W.H., Jacobs-Sera,D., Hendrix,R.W."
5,NC_028667.1,,"Pourcel,C., Midoux,C., Bourkaltseva,M., Pleteneva,E., Krylov,V."
6,NC_028974.1,,"Djamen,P.Y., Nguyen,L., Gibbs,Z.A., Donegan-Quick,R., Visi,D.K., Allen,M.S., Hughes,L.E., Bradley,K.W., Asai,D.J., Bowman,C.A., Russell,D.A., Pope,W.H., Jacobs-Sera,D., Hendrix,R.W., Hatfull,G.F."
7,NC_031230.1,,"Bandyopadhyay,A., Carlton,M.L., Kane,M.T., Panchal,N.J., Pham,Y.C., Reynolds,Z.J., Sapienza,M.S., German,B.A., McDonnell,J.E., Schafer,C.E., Yu,V.J., Warner,M.H., Furbee,E.C., Grubb,S.R., Montgomery,M.T., Garlena,R.A., Russell,D.A., Pope,W.H., Jacobs-Sera,D., Hendrix,R.W., Hatfull,G.F."
8,NC_047712.1,,"Gregory,A.C., LaButti,K., Copeland,A., Woyke,T., Sullivan,M.B."
9,NC_051670.1,,"Alvarez,R.V., Connors,B., Jenkins,J., Bandura,A.J., Lazo,A.N., Lorang,A.C., Mazariego,B.E., Mendez,P.D., Myrthil,G.D., Wright,J., Gurney,S.M.R., Garlena,R.A., Russell,D.A., Pope,W.H., Jacobs-Sera,D., Hatfull,G.F."
10,NC_054986.1,,"Pujato,S.A., Guglielmotti,D.M., Martinez-Garcia,M., Quiberoni,A., Mojica,F.J.M., Mojica,F.J."


In [9]:
# Print the metadata table with every column shown (allcols=true).
# NOTE(review): the earlier comment here said "download genomic fasta files", but this cell
# only displays the table.
show(entity_metadata, allcols=true)

[1m10×23 DataFrame[0m
[1m Row [0m│[1m Accession   [0m[1m SRA_Accession [0m[1m Submitters                        [0m[1m Release_Date         [0m[1m Isolate   [0m[1m Species                           [0m[1m Genus          [0m[1m Family       [0m[1m Molecule_type [0m[1m Length [0m[1m Sequence_Type [0m[1m Nuc_Completeness [0m[1m Genotype [0m[1m Segment [0m[1m Publications [0m[1m Geo_Location                     [0m[1m Country [0m[1m USA    [0m[1m Host                              [0m[1m Isolation_Source [0m[1m Collection_Date [0m[1m BioSample [0m[1m GenBank_Title                     [0m
[1m     [0m│[90m String      [0m[90m String        [0m[90m String                            [0m[90m String               [0m[90m String    [0m[90m String                            [0m[90m String         [0m[90m String       [0m[90m String        [0m[90m Int64  [0m[90m String        [0m[90m String           [0m[90m String   [0m[

In [None]:
# k-mer lengths passed to the DNA and amino-acid count-table functions in later cells.
dna_k = 5
aa_k = 2

# Amino-acid alphabet from Mycelia with the BioSequences.AA_Term symbol filtered out.
AA_ALPHABET = collect(filter(x -> x != BioSequences.AA_Term, Mycelia.AA_ALPHABET))
# DNA alphabet taken directly from Mycelia.
DNA_ALPHABET = Mycelia.DNA_ALPHABET

In [64]:
# Accessions of the rows in entity_metadata; one count-matrix column is built per accession.
accession_list = entity_metadata[!, "Accession"]
aa_k = 2
# NOTE(review): this bare expression is not the last statement of the cell, so it has no
# visible effect.
AA_ALPHABET
# Amino-acid k-mer counts: one row per k-mer, one column per accession.
aamer_counts_matrix = accession_list_to_aamer_counts_table(accession_list, aa_k, AA_ALPHABET)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:08[39m


484×10 Matrix{Float64}:
  48.0  119.0  329.0  374.0  223.0  385.0  270.0  289.0  260.0   6.0
  27.0   79.0  166.0  175.0  102.0  152.0  191.0  172.0  133.0  13.0
  56.0   48.0   57.0   62.0   42.0   55.0   82.0  220.0   50.0  42.0
  38.0   88.0  128.0  147.0  100.0  136.0  180.0  218.0  124.0  36.0
   3.0   11.0   36.0   20.0   13.0   19.0   28.0   34.0   24.0   0.0
  44.0   55.0   79.0   99.0   77.0   87.0  103.0  151.0   66.0  26.0
  61.0  108.0  192.0  208.0  125.0  174.0  194.0  246.0  136.0  16.0
  52.0  104.0  193.0  196.0  125.0  202.0  197.0  318.0  177.0  44.0
   8.0   17.0   43.0   43.0   27.0   43.0   62.0   62.0   46.0   2.0
  68.0   50.0  118.0   98.0   80.0   62.0  143.0  246.0   77.0  50.0
  72.0  108.0  201.0  189.0  135.0  163.0  238.0  313.0  149.0  40.0
 103.0   51.0   91.0   68.0   51.0   64.0  120.0  256.0   76.0  28.0
  17.0   26.0   50.0   40.0   39.0   44.0   52.0   88.0   28.0  11.0
   ⋮                                  ⋮                         
   0.0    0.0 

In [67]:
# DNA k-mer counts: one row per canonical k-mer, one column per accession.
dna_k = 5
dnamer_counts_matrix = accession_list_to_dnamer_counts_table(accession_list, dna_k)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:08[39m


512×10 Matrix{Float64}:
 395.0  102.0   27.0   15.0   14.0    3.0   17.0  304.0   42.0  213.0
 182.0   92.0   30.0   25.0   29.0   12.0   42.0  461.0   68.0  130.0
 270.0  108.0   19.0   16.0   22.0    7.0   35.0  370.0   28.0  135.0
 315.0   57.0   24.0    8.0    6.0    5.0   17.0  305.0   23.0  174.0
 173.0   76.0   25.0   18.0   29.0    6.0   61.0  683.0   76.0  108.0
  74.0   86.0   84.0   59.0   54.0   24.0  110.0  683.0  148.0   79.0
  68.0   95.0   66.0   43.0   46.0   33.0   77.0  237.0   79.0   78.0
 166.0   67.0   18.0   12.0   25.0    8.0   45.0  847.0   50.0   93.0
 267.0   91.0   33.0   18.0   33.0   16.0   51.0  776.0   36.0   78.0
 184.0  123.0   52.0   42.0   47.0    6.0   51.0  236.0   66.0   84.0
 121.0   73.0   24.0   19.0   33.0   19.0   61.0  531.0   51.0   66.0
 178.0   62.0   16.0   16.0   14.0   11.0   40.0  545.0   36.0   93.0
 252.0   33.0    7.0    6.0    7.0    1.0   17.0  497.0   12.0  189.0
   ⋮                                  ⋮                         
 

In [87]:
# Pairwise Euclidean distances between the columns of the amino-acid k-mer count matrix.
euclidean_distance_matrix = counts_matrix_to_euclidean_distance_matrix(aamer_counts_matrix)

10×10 Matrix{Float64}:
    0.0     363.395   930.143   828.972  …  3426.15   676.033   292.828
  363.395     0.0     740.48    616.57      3407.29   462.511   431.268
  930.143   740.48      0.0     365.032     2883.35   401.759  1084.02
  828.972   616.57    365.032     0.0       3126.63   323.5     946.88
  481.347   291.405   681.845   508.162     3417.06   423.458   542.825
  799.046   585.655   435.738   284.974  …  3184.68   324.302   897.018
 1249.0    1114.11    547.489   790.581     2478.87   818.754  1436.61
 3426.15   3407.29   2883.35   3126.63         0.0   3131.89   3626.07
  676.033   462.511   401.759   323.5       3131.89     0.0     785.882
  292.828   431.268  1084.02    946.88      3626.07   785.882     0.0

10×10 Matrix{Float64}:
 0.0        0.100217   0.256515  0.228614   …  0.944867  0.186437   0.0807562
 0.100217   0.0        0.20421   0.170038      0.939666  0.127551   0.118935
 0.256515   0.20421    0.0       0.100669      0.795172  0.110797   0.298952
 0.228614   0.170038   0.100669  0.0           0.862263  0.0892149  0.261131
 0.132746   0.0803639  0.18804   0.140141      0.942359  0.116782   0.149701
 0.220361   0.161512   0.120168  0.0785902  …  0.878272  0.0894363  0.24738
 0.34445    0.30725    0.150987  0.218027      0.683623  0.225796   0.39619
 0.944867   0.939666   0.795172  0.862263      0.0       0.863715   1.0
 0.186437   0.127551   0.110797  0.0892149     0.863715  0.0        0.216731
 0.0807562  0.118935   0.298952  0.261131      1.0       0.216731   0.0

In [86]:
# Pairwise Jaccard distances on k-mer presence/absence, from the amino-acid count matrix.
jaccard_distance_matrix = counts_matrix_to_jaccard_distance_matrix(aamer_counts_matrix)

10×10 Matrix{Float64}:
 0.0        0.0075  0.0075  0.01       …  0.0075  0.0075  0.0075  0.0678392
 0.0075     0.0     0.0     0.0025        0.0     0.0     0.0     0.07
 0.0075     0.0     0.0     0.0025        0.0     0.0     0.0     0.07
 0.01       0.0025  0.0025  0.0           0.0025  0.0025  0.0025  0.0676692
 0.0125313  0.01    0.01    0.0125        0.01    0.01    0.01    0.0703518
 0.0075     0.0     0.0     0.0025     …  0.0     0.0     0.0     0.07
 0.0075     0.0     0.0     0.0025        0.0     0.0     0.0     0.07
 0.0075     0.0     0.0     0.0025        0.0     0.0     0.0     0.07
 0.0075     0.0     0.0     0.0025        0.0     0.0     0.0     0.07
 0.0678392  0.07    0.07    0.0676692     0.07    0.07    0.07    0.0

In [90]:
# Rescale the Jaccard distances by their maximum.
normalized_jaccard_distance_matrix = normalize_distance_matrix(jaccard_distance_matrix)

10×10 Matrix{Float64}:
 0.0       0.106607   0.106607   0.142143   …  0.106607   0.106607   0.964286
 0.106607  0.0        0.0        0.0355357     0.0        0.0        0.995
 0.106607  0.0        0.0        0.0355357     0.0        0.0        0.995
 0.142143  0.0355357  0.0355357  0.0           0.0355357  0.0355357  0.961869
 0.178124  0.142143   0.142143   0.177679      0.142143   0.142143   1.0
 0.106607  0.0        0.0        0.0355357  …  0.0        0.0        0.995
 0.106607  0.0        0.0        0.0355357     0.0        0.0        0.995
 0.106607  0.0        0.0        0.0355357     0.0        0.0        0.995
 0.106607  0.0        0.0        0.0355357     0.0        0.0        0.995
 0.964286  0.995      0.995      0.961869      0.995      0.995      0.0

In [91]:
# Pairwise Jensen-Shannon divergences between the columns of the amino-acid count matrix,
# then the same values rescaled by their maximum.
js_divergence_matrix = counts_matrix_to_js_divergence_matrix(aamer_counts_matrix)
normalized_js_divergence_matrix = normalize_distance_matrix(js_divergence_matrix)

10×10 Matrix{Float64}:
 0.0       0.392346  0.577981   0.733273  …  0.198749  0.600575   0.20184
 0.392346  0.0       0.150022   0.244279     0.279905  0.170571   0.570267
 0.577981  0.150022  0.0        0.112222     0.373758  0.0740063  0.763536
 0.733273  0.244279  0.112222   0.0          0.544724  0.113679   0.903351
 0.59137   0.209471  0.170857   0.172598     0.504903  0.186445   0.752823
 0.827008  0.302399  0.15205    0.128813  …  0.597838  0.151395   1.0
 0.420278  0.11946   0.100075   0.2094       0.256556  0.121783   0.592963
 0.198749  0.279905  0.373758   0.544724     0.0       0.378828   0.244621
 0.600575  0.170571  0.0740063  0.113679     0.378828  0.0        0.757206
 0.20184   0.570267  0.763536   0.903351     0.244621  0.757206   0.0

In [None]:
# Sorted, de-duplicated, non-empty labels at each taxonomic rank.
# NOTE(review): `reference_phage_metadata` is not defined in the visible cells (the table
# loaded above is named `entity_metadata`) — confirm which table is intended.
unique_species = filter(!isempty, sort(unique(reference_phage_metadata[!, "Species"])))
unique_genera = filter(!isempty, sort(unique(reference_phage_metadata[!, "Genus"])))
unique_families = filter(!isempty, sort(unique(reference_phage_metadata[!, "Family"])))

Euclidean distance seems better than cosine distance.
Is Jensen-Shannon divergence better than Euclidean distance?
Mash with k=13 and s=1_000 was not good:
- try a smaller k (e.g. 9)
- try a larger s

In [None]:
# Choose which distance matrix (and its label) the cells below analyse and plot.
# Uncomment exactly one pair.
# NOTE(review): none of the matrices named in this cell, including `ani_distance_matrix`,
# are defined in the visible cells — confirm where they come from.

# distance_matrix = distance_matrix_raw_kmer_counts_euclidean
# distance_metric = "euclidean"

# distance_matrix = distance_matrix_raw_kmer_counts_cosine
# distance_metric = "cosine"

# distance_matrix = distance_matrix_raw_kmer_probability_js_divergence
# distance_metric = "js_divergence"

# distance_matrix = mash_distance_matrix
# distance_metric = "mash"

distance_matrix = ani_distance_matrix
distance_metric = "ani"

In [None]:
# Collect, for every record with a species label, its distances to records of the same
# species (intra) and to records of every other species (inter).
intra_species_distances = Float64[]
inter_species_distances = Float64[]
ProgressMeter.@showprogress for species in unique_species
    members = findall(reference_phage_metadata[!, "Species"] .== species)
    non_members = setdiff(1:size(distance_matrix, 1), members)
    for member in members
        # Same-species records, excluding the record itself.
        siblings = filter(!=(member), members)
        append!(intra_species_distances, distance_matrix[member, siblings])
        append!(inter_species_distances, distance_matrix[member, non_members])
    end
end

In [None]:
# Summary statistics for a random sample of 10_000 between-species distances.
StatsBase.describe(StatsBase.sample(inter_species_distances, 10_000))

In [None]:
# Summary statistics for a random sample of 10_000 within-species distances.
StatsBase.describe(StatsBase.sample(intra_species_distances, 10_000))

In [None]:
# Jittered scatter of sampled within-species vs between-species distances, saved as PNG
# and SVG under DIR.
# NOTE(review): `k` is interpolated into the title and file names but is not defined in the
# visible cells (only `dna_k` and `aa_k` are) — confirm.
n_points = 1_000
p = StatsPlots.scatter(
    # x positions: group 1 and group 2, each offset by a random amount below 1/3 to
    # either side.
    [
        [rand(Bool) ? 1 + rand()/3 : 1 - rand()/3 for i in 1:n_points],
        [rand(Bool) ? 2 + rand()/3 : 2 - rand()/3 for i in 1:n_points]
    ],
    # y values: n_points distances sampled from each group.
    [
        StatsBase.sample(intra_species_distances, n_points),
        StatsBase.sample(inter_species_distances, n_points)
    ],
    label = ["within species" "between species"],
    legend = :outertopright,
    alpha = 0.5,
    title = "$(distance_metric) distance @ k=$k"
)
# Horizontal line at mean + 3 standard deviations of the within-species distances.
p = StatsPlots.hline(p,
    [Statistics.mean(intra_species_distances) + 3*Statistics.std(intra_species_distances)],
    label="mean + 3σ"
)
StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-species-k$k.png")
StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-species-k$k.svg")
p

In [None]:
# Collect, for every record with a genus label, its distances to records of the same
# genus (intra) and to records of every other genus (inter).
intra_genera_distances = Float64[]
inter_genera_distances = Float64[]
ProgressMeter.@showprogress for genus in unique_genera
    members = findall(reference_phage_metadata[!, "Genus"] .== genus)
    non_members = setdiff(1:size(distance_matrix, 1), members)
    for member in members
        # Same-genus records, excluding the record itself.
        siblings = filter(!=(member), members)
        append!(intra_genera_distances, distance_matrix[member, siblings])
        append!(inter_genera_distances, distance_matrix[member, non_members])
    end
end

In [None]:
# Summary statistics for a random sample of 10_000 between-genera distances.
StatsBase.describe(StatsBase.sample(inter_genera_distances, 10_000))

In [None]:
# Summary statistics for a random sample of 10_000 within-genera distances.
StatsBase.describe(StatsBase.sample(intra_genera_distances, 10_000))

In [None]:
# Jittered scatter of sampled within-genera vs between-genera distances, saved as PNG and
# SVG under DIR.
# NOTE(review): `k` is interpolated into the title and file names but is not defined in the
# visible cells (only `dna_k` and `aa_k` are) — confirm.
n_points = 1_000
p = StatsPlots.scatter(
    # x positions: group 1 and group 2, each offset by a random amount below 1/3 to
    # either side.
    [
        [rand(Bool) ? 1 + rand()/3 : 1 - rand()/3 for i in 1:n_points],
        [rand(Bool) ? 2 + rand()/3 : 2 - rand()/3 for i in 1:n_points]
    ],
    # y values: n_points distances sampled from each group.
    [
        StatsBase.sample(intra_genera_distances, n_points),
        StatsBase.sample(inter_genera_distances, n_points)
    ],
    label = ["within genera" "between genera"],
    legend = :outertopright,
    alpha = 0.5,
    title = "$(distance_metric) distance @ k=$k"
)
# Horizontal line at mean + 3 standard deviations of the within-genera distances.
p = StatsPlots.hline(p,
    [Statistics.mean(intra_genera_distances) + 3*Statistics.std(intra_genera_distances)],
    label="mean + 3σ"
)
StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-genus-k$k.png")
StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-genus-k$k.svg")
p

In [None]:
# Collect, for every record with a family label, its distances to records of the same
# family (intra) and to records of every other family (inter).
intra_family_distances = Float64[]
inter_family_distances = Float64[]
ProgressMeter.@showprogress for family in unique_families
    members = findall(reference_phage_metadata[!, "Family"] .== family)
    non_members = setdiff(1:size(distance_matrix, 1), members)
    for member in members
        # Same-family records, excluding the record itself.
        siblings = filter(!=(member), members)
        append!(intra_family_distances, distance_matrix[member, siblings])
        append!(inter_family_distances, distance_matrix[member, non_members])
    end
end

In [None]:
# Summary statistics for a random sample of 10_000 between-family distances.
StatsBase.describe(StatsBase.sample(inter_family_distances, 10_000))

In [None]:
# Summary statistics for a random sample of 10_000 within-family distances.
StatsBase.describe(StatsBase.sample(intra_family_distances, 10_000))

In [None]:
# Jittered scatter of sampled within-family vs between-family distances, saved as PNG and
# SVG under DIR.
# NOTE(review): `k` is interpolated into the title and file names but is not defined in the
# visible cells (only `dna_k` and `aa_k` are) — confirm.
n_points = 1_000
p = StatsPlots.scatter(
    # x positions: group 1 and group 2, each offset by a random amount below 1/3 to
    # either side.
    [
        [rand(Bool) ? 1 + rand()/3 : 1 - rand()/3 for i in 1:n_points],
        [rand(Bool) ? 2 + rand()/3 : 2 - rand()/3 for i in 1:n_points]
    ],
    # y values: n_points distances sampled from each group.
    [
        StatsBase.sample(intra_family_distances, n_points),
        StatsBase.sample(inter_family_distances, n_points)
    ],
    label = ["within families" "between families"],
    legend = :outertopright,
    alpha = 0.5,
    title = "$(distance_metric) distance @ k=$k"
)
# Horizontal line at mean + 3 standard deviations of the within-family distances.
p = StatsPlots.hline(p,
    [Statistics.mean(intra_family_distances) + 3*Statistics.std(intra_family_distances)],
    label="mean + 3σ"
)
StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-family-k$k.png")
StatsPlots.savefig(p, "$DIR/$(distance_metric)-distance-family-k$k.svg")
p

In [None]:
# kmer_counts_matrix = nothing
# distance_matrix_raw_kmer_counts_cosine = nothing
# distance_matrix_raw_kmer_counts_euclidean = nothing
# GC.gc()