# Core Proteome

# Initialize Directory

In [1]:
DATE_TASK = "2022-03-12-ecoli-tequatrovirus"
DIR = mkpath("$(homedir())/workspace/$DATE_TASK")
cd(DIR)
DATE, TASK = match(r"^(\d{4}-\d{2}-\d{2})-(.*)$", DATE_TASK).captures

2-element Vector{Union{Nothing, SubString{String}}}:
 "2022-03-12"
 "ecoli-tequatrovirus"

# Import Packages

In [2]:
import Pkg
Pkg.update()
pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"StatsPlots",
"StatsBase",
"Statistics",
"MultivariateStats",
"Random",
"Primes",
"SparseArrays",
"SHA",
"GenomicAnnotations",
"Combinatorics",
"OrderedCollections",
"Downloads",
"Clustering",
"Revise",
"Mmap",
"LsqFit",
"BioSymbols"
]

for pkg in pkgs
    try
        eval(Meta.parse("import $pkg"))
    catch
        Pkg.add(pkg)
        eval(Meta.parse("import $pkg"))
    end
end

# works but can't update locally, need to push and restart kernel to activate changes
# "https://github.com/cjprybol/Mycelia.git#master",
# didn't work
# "$(homedir())/git/Mycelia#master",
pkg_path = "$(homedir())/git/Mycelia"
try
    eval(Meta.parse("import $(basename(pkg_path))"))
catch
    # Pkg.add(url=pkg)
    Pkg.develop(path=pkg_path)
    # pkg = replace(basename(pkg), ".git#master" => "")
    # pkg = replace(basename(pkg), "#master" => "")
    eval(Meta.parse("import $(basename(pkg_path))"))
end

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/cjprybol/Mycelia.git#master`
[32m[1m   Installed[22m[39m ArrayInterface ─ v5.0.3
[32m[1m  No Changes[22m[39m to `~/git/Mycelia/docs/Project.toml`
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Manifest.toml`
 [90m [4fba245c] [39m[93m↑ ArrayInterface v5.0.2 ⇒ v5.0.3[39m
[32m[1mPrecompiling[22m[39m project...
[32m  ✓ [39m[90mArrayInterface[39m
[32m  ✓ [39m[90mFiniteDiff[39m
[32m  ✓ [39m[90mNLSolversBase[39m
[32m  ✓ [39m[90mOptimBase[39m
[32m  ✓ [39mLsqFit
[32m  ✓ [39mMycelia
  6 dependencies successfully precompiled in 38 seconds (247 already precompiled, 9 skipped during auto due to previous errors)


# Initialize Functions

In [3]:
function wcss(clustering_result)
    n_clusters = length(clustering_result.counts)
    total_squared_cost = 0.0
    for cluster_id in 1:n_clusters
        cluster_indices = clustering_result.assignments .== cluster_id
        total_squared_cost += sum(clustering_result.costs[cluster_indices] .^ 2)
    end
    return total_squared_cost
end

wcss (generic function with 1 method)

In [4]:
function generate_all_possible_kmers(k, alphabet)
    kmer_iterator = Iterators.product([alphabet for i in 1:k]...)
    kmer_vectors = collect.(vec(collect(kmer_iterator)))
    if eltype(alphabet) == BioSymbols.AminoAcid
        kmers = BioSequences.LongAminoAcidSeq.(kmer_vectors)
        if k > 1
            filter!(kmer -> kmer[1] != BioSequences.AA_Term, kmers)
        end
    elseif eltype(alphabet) == BioSymbols.DNA
        kmers = BioSequences.LongDNASeq.(kmer_vectors)
    else
        error()
    end
    return sort!(kmers)
end

generate_all_possible_kmers (generic function with 1 method)

In [5]:
function assess_aamer_saturation(fasta_records::AbstractVector{FASTX.FASTA.Record}, k; kmers_to_assess=Inf, power=10)
    kmers = Set{BioSequences.LongAminoAcidSeq}()
    
    max_possible_kmers = length(generate_all_possible_kmers(k, Mycelia.AA_ALPHABET))
    
    if kmers_to_assess == Inf
        kmers_to_assess = max_possible_kmers
    end
    
    sampling_points = Int[0]
    i = 0
    while power^i <= kmers_to_assess
        push!(sampling_points, power^i)
        i += 1
    end
    
    unique_kmer_counts = zeros(Int, length(sampling_points))
    
    if length(sampling_points) < 3
        @info "increase the # of reads analyzed or decrease the power to acquire more data points"
        return (;sampling_points, unique_kmer_counts)
    end
    
    p = ProgressMeter.Progress(kmers_to_assess, 1)
    
    kmers_assessed = 0
    for record in fasta_records
        # for kmer in BioSequences.each(kmer_type, FASTX.sequence(record))
        for i in 1:length(FASTX.sequence(record))-k+1
            kmer = FASTX.sequence(record)[i:i+k-1]
            push!(kmers, kmer)
            kmers_assessed += 1
            if (length(kmers) == max_possible_kmers)                 
                sampling_points = vcat(filter(s -> s < kmers_assessed, sampling_points), [kmers_assessed])
                unique_kmer_counts = vcat(unique_kmer_counts[1:length(sampling_points)-1], length(kmers))
                return (;sampling_points, unique_kmer_counts, eof = false)
            elseif kmers_assessed in sampling_points
                i = findfirst(sampling_points .== kmers_assessed)
                unique_kmer_counts[i] = length(kmers)
                if i == length(sampling_points)
                    return (sampling_points = sampling_points, unique_kmer_counts = unique_kmer_counts, eof = false)
                end
            end
            ProgressMeter.next!(p)
        end
    end
    sampling_points = vcat(filter(s -> s < kmers_assessed, sampling_points), [kmers_assessed])
    unique_kmer_counts = vcat(unique_kmer_counts[1:length(sampling_points)-1], [length(kmers)])    
    return (sampling_points = sampling_points, unique_kmer_counts = unique_kmer_counts, eof = true)
end


function assess_aamer_saturation(fastxs::AbstractVector{String}, k; kmers_to_assess=Inf, power=10)
    kmers = Set{BioSequences.LongAminoAcidSeq}()
    
    max_possible_kmers = length(generate_all_possible_kmers(k, Mycelia.AA_ALPHABET))
    
    if kmers_to_assess == Inf
        kmers_to_assess = max_possible_kmers
    end
    
    sampling_points = Int[0]
    i = 0
    while power^i <= kmers_to_assess
        push!(sampling_points, power^i)
        i += 1
    end
    
    unique_kmer_counts = zeros(Int, length(sampling_points))
    
    if length(sampling_points) < 3
        @info "increase the # of reads analyzed or decrease the power to acquire more data points"
        return (;sampling_points, unique_kmer_counts)
    end
    
    p = ProgressMeter.Progress(kmers_to_assess, 1)
    
    kmers_assessed = 0
    for fastx in fastxs
        for record in Mycelia.open_fastx(fastx)
            # for kmer in BioSequences.each(kmer_type, FASTX.sequence(record))
            for i in 1:length(FASTX.sequence(record))-k+1
                kmer = FASTX.sequence(record)[i:i+k-1]
                push!(kmers, kmer)
                kmers_assessed += 1
                if (length(kmers) == max_possible_kmers)                 
                    sampling_points = vcat(filter(s -> s < kmers_assessed, sampling_points), [kmers_assessed])
                    unique_kmer_counts = vcat(unique_kmer_counts[1:length(sampling_points)-1], length(kmers))
                    return (;sampling_points, unique_kmer_counts, eof = false)
                elseif kmers_assessed in sampling_points
                    i = findfirst(sampling_points .== kmers_assessed)
                    unique_kmer_counts[i] = length(kmers)
                    if i == length(sampling_points)
                        return (sampling_points = sampling_points, unique_kmer_counts = unique_kmer_counts, eof = false)
                    end
                end
                ProgressMeter.next!(p)
            end
        end
    end
    sampling_points = vcat(filter(s -> s < kmers_assessed, sampling_points), [kmers_assessed])
    unique_kmer_counts = vcat(unique_kmer_counts[1:length(sampling_points)-1], [length(kmers)])    
    return (sampling_points = sampling_points, unique_kmer_counts = unique_kmer_counts, eof = true)
end

function assess_aamer_saturation(fastxs; outdir="", min_k=1, max_k=15, threshold=0.1)
    
    if isempty(outdir)
        outdir = joinpath(pwd(), "aamer-saturation")
    end
    mkpath(outdir)
    
    ks = Primes.primes(min_k, max_k)
    ks = min_k:max_k
    minimum_saturation = Inf
    midpoint = Inf
    
    
    
    for k in ks
        kmers_to_assess = 10_000_000
        sampling_points, kmer_counts, hit_eof = assess_aamer_saturation(fastxs, k, kmers_to_assess=kmers_to_assess)
        @show sampling_points, kmer_counts, hit_eof
        observed_midpoint_index = findfirst(i -> kmer_counts[i] > last(kmer_counts)/2, 1:length(sampling_points))
        observed_midpoint = sampling_points[observed_midpoint_index]
        initial_parameters = Float64[maximum(kmer_counts), observed_midpoint]
        @time fit = LsqFit.curve_fit(Mycelia.calculate_v, sampling_points, kmer_counts, initial_parameters)
        if hit_eof
            inferred_maximum = last(kmer_counts)
        else
            inferred_maximum = max(Int(ceil(fit.param[1])), last(kmer_counts))
        end

        max_possible_kmers = length(generate_all_possible_kmers(k, Mycelia.AA_ALPHABET))
        
        inferred_midpoint = Int(ceil(fit.param[2]))
        predicted_saturation = inferred_maximum / max_possible_kmers
        @show k, predicted_saturation

        p = StatsPlots.scatter(
            sampling_points,
            kmer_counts,
            label="observed kmer counts",
            ylabel="# unique kmers",
            xlabel="# kmers assessed",
            title = "sequencing saturation @ k = $k",
            legend=:outertopright,
            size=(800, 400),
            margins=3StatsPlots.PlotMeasures.mm
            )
        StatsPlots.hline!(p, [max_possible_kmers], label="absolute maximum")
        StatsPlots.hline!(p, [inferred_maximum], label="inferred maximum")
        StatsPlots.vline!(p, [inferred_midpoint], label="inferred midpoint")
        # xs = vcat(sampling_points, [last(sampling_points) * 2^i for i in 1:2])
        xs = sort([sampling_points..., inferred_midpoint])
        ys = Mycelia.calculate_v(xs, fit.param)
        StatsPlots.plot!(
            p,
            xs,
            ys,
            label="fit trendline")
        display(p)
        StatsPlots.savefig(p, joinpath(outdir, "$k.png"))
        StatsPlots.savefig(p, joinpath(outdir, "$k.svg"))

        if predicted_saturation < minimum_saturation
            minimum_saturation = predicted_saturation
            min_k = k
            midpoint = inferred_midpoint 
        end
        if predicted_saturation < threshold
            chosen_k_file = joinpath(outdir, "chosen_k.txt")
            println("chosen k = $k")
            open(chosen_k_file, "w") do io
                println(io, k)
            end
            return k
        end
    end
end

assess_aamer_saturation (generic function with 3 methods)

In [6]:
function normalize_ncbi_description(description::AbstractString)
    # e.g. "hypothetical protein ecml134_001 [escherichia phage ecml-134]"
    # will return "hypothetical protein"
    # @show description
    description = replace(description, r"\s*\[.*?\]$" => "")
    # @show description
    description = replace(description, r"\s*[a-z0-9]+_[0-9]+$" => "")
    # @show description
    return description
end

normalize_ncbi_description (generic function with 1 method)

In [7]:
function document_frequency(documents)
    document_tokens = Set(split(strip(first(documents))))
    countmap = StatsBase.countmap(document_tokens)
    for document in documents[2:end]
        document_tokens = Set(split(strip(document)))
        this_countmap = StatsBase.countmap(document_tokens)
        merge!(+, countmap, this_countmap)
    end
    return countmap
end

document_frequency (generic function with 1 method)

In [8]:
# function fit_optimal_number_of_clusters(distance_matrix)
#     ks_to_try = vcat([2^i for i in 0:Int(floor(log2(size(distance_matrix, 1))))], size(distance_matrix, 1))
#     # @info "ks = $(ks_to_try)"
#     @show ks_to_try
    
#     # can calculate this for k >= 1
#     # within_cluster_sum_of_squares = Union{Float64, Missing}[]
#     within_cluster_sum_of_squares = Float64[]
#     # these are only valid for k >= 2 so set initial value to missing
#     # between_cluster_sum_of_squares = [missing, zeros(length(ks_to_try)-1)...]
#     # silhouette_scores = Union{Float64, Missing}[]
#     silhouette_scores = Float64[]
        
#     current_k_index = 1
#     # @info "assessing k = $(ks_to_try[current_k_index])"
#     @show ks_to_try[current_k_index]
#     this_clustering = Clustering.kmeans(distance_matrix, ks_to_try[current_k_index])
#     push!(within_cluster_sum_of_squares, wcss(this_clustering))
#     push!(silhouette_scores, 0)

#     if length(ks_to_try) == 1
#         optimal_number_of_clusters = ks_to_try[current_k_index]
#     else
#         current_k_index += 1
#         # @info "assessing k = $(ks_to_try[current_k_index])"
#         @show ks_to_try[current_k_index]
#         this_clustering = Clustering.kmeans(distance_matrix, ks_to_try[current_k_index])
#         push!(within_cluster_sum_of_squares, wcss(this_clustering))
#         # push!(silhouette_scores, Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix)))
#         this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
#         # this_silhouette_score /= log10(ks_to_try[current_k_index])
#         push!(silhouette_scores, this_silhouette_score)
        
#         if (within_cluster_sum_of_squares[2] >= within_cluster_sum_of_squares[1])
#             optimal_number_of_clusters = ks_to_try[1]
#         else
#             optimal_number_of_clusters = ks_to_try[2]
#             if length(ks_to_try) > 2
#                 current_k_index += 1
#                 @show ks_to_try[current_k_index]
#                 # @info "assessing k = $(ks_to_try[current_k_index])"
#                 this_clustering = Clustering.kmeans(distance_matrix, ks_to_try[current_k_index])
#                 push!(within_cluster_sum_of_squares, wcss(this_clustering))
#                 # push!(silhouette_scores, Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix)))
#                 this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
#                 # this_silhouette_score /= log10(ks_to_try[current_k_index])
#                 push!(silhouette_scores, this_silhouette_score)
                
#                 while (silhouette_scores[end] > silhouette_scores[end-1]) &&
#                         (within_cluster_sum_of_squares[end] < within_cluster_sum_of_squares[end-1]) &&
#                         (current_k_index < length(ks_to_try))
#                     current_k_index += 1
#                     @show ks_to_try[current_k_index]
#                     # @info "assessing k = $(ks_to_try[current_k_index])"
#                     this_clustering = Clustering.kmeans(distance_matrix, ks_to_try[current_k_index])
#                     push!(within_cluster_sum_of_squares, wcss(this_clustering))
#                     # push!(silhouette_scores, Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix)))
#                     this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
#                     # this_silhouette_score /= log10(ks_to_try[current_k_index])
#                     push!(silhouette_scores, this_silhouette_score)
#                 end
#                 # here is where we should start grid searching within the best range
#                 optimal_silhouette, optimal_index = findmax(silhouette_scores)
#                 optimal_number_of_clusters = ks_to_try[optimal_index]
#                 # @info "refining..."
#                 @show "refining"
#                 # @info "current optimal number of clusters = $(ks_to_try[optimal_index])"
#                 # @show ks_to_try[optimal_index]
#                 # @info "current best silhouette score = $(optimal_silhouette)"
#                 @show optimal_silhouette
                                
#                 if optimal_index != length(ks_to_try)
#                     window_of_focus = ks_to_try[optimal_index-1:optimal_index+1]
                    
#                     k_to_try = Int(round(Statistics.mean(window_of_focus[1:2])))
#                     insertion_index = first(searchsorted(ks_to_try, k_to_try))
#                     if ks_to_try[insertion_index] != k_to_try
#                         insert!(ks_to_try, insertion_index, k_to_try)
#                         # @info "assessing k = $(k_to_try)"
#                         @show k_to_try
#                         this_clustering = Clustering.kmeans(distance_matrix, k_to_try)
#                         insert!(within_cluster_sum_of_squares, insertion_index, wcss(this_clustering))
#                         # insert!(silhouette_scores, insertion_index, Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix)))
#                         this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
#                         # this_silhouette_score /= log10(k_to_try)
#                         insert!(silhouette_scores, insertion_index, this_silhouette_score)
#                     end

#                     k_to_try = Int(round(Statistics.mean(window_of_focus[2:3])))
#                     insertion_index = first(searchsorted(ks_to_try, k_to_try))
#                     if ks_to_try[insertion_index] != k_to_try
#                         # @info "assessing k = $(k_to_try)"
#                         @show k_to_try
#                         this_clustering = Clustering.kmeans(distance_matrix, k_to_try)
#                         insert!(ks_to_try, insertion_index, k_to_try)
#                         insert!(within_cluster_sum_of_squares, insertion_index, wcss(this_clustering))
#                         # insert!(silhouette_scores, insertion_index, Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix)))
#                         this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
#                         # this_silhouette_score /= log10(k_to_try)
#                         insert!(silhouette_scores, insertion_index, this_silhouette_score)
#                     end
                    
#                     new_optimal_silhouette, new_optimal_index = findmax(silhouette_scores)
#                     new_optimal_number_of_clusters = ks_to_try[new_optimal_index]
                    
#                     while (new_optimal_number_of_clusters != optimal_number_of_clusters) && (new_optimal_index != length(ks_to_try))
#                         optimal_number_of_clusters = new_optimal_number_of_clusters
#                         optimal_index = new_optimal_index
#                         optimal_silhouette = new_optimal_silhouette
#                         # @info "current optimal number of clusters = $(ks_to_try[optimal_index])"
#                         @show ks_to_try[optimal_index]
#                         # @info "current best silhouette score = $(optimal_silhouette)"
#                         @show optimal_silhouette
                        
#                         window_of_focus = ks_to_try[optimal_index-1:optimal_index+1]

#                         k_to_try = Int(round(Statistics.mean(window_of_focus[1:2])))
#                         insertion_index = first(searchsorted(ks_to_try, k_to_try))
#                         if ks_to_try[insertion_index] != k_to_try
#                             # @info "assessing k = $(k_to_try)"
#                             @show k_to_try
#                             this_clustering = Clustering.kmeans(distance_matrix, k_to_try)
#                             insert!(ks_to_try, insertion_index, k_to_try)
#                             insert!(within_cluster_sum_of_squares, insertion_index, wcss(this_clustering))
#                             # insert!(silhouette_scores, insertion_index, Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix)))
#                             this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
#                             # this_silhouette_score /= log10(k_to_try)
#                             insert!(silhouette_scores, insertion_index, this_silhouette_score)
#                         end

#                         k_to_try = Int(round(Statistics.mean(window_of_focus[2:3])))
#                         insertion_index = first(searchsorted(ks_to_try, k_to_try))
#                         if ks_to_try[insertion_index] != k_to_try
#                             # @info "assessing k = $(k_to_try)"
#                             @show k_to_try
#                             this_clustering = Clustering.kmeans(distance_matrix, k_to_try)
#                             insert!(ks_to_try, insertion_index, k_to_try)
#                             insert!(within_cluster_sum_of_squares, insertion_index, wcss(this_clustering))
#                             # insert!(silhouette_scores, insertion_index, Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix)))
#                             this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
#                             # this_silhouette_score /= log10(k_to_try)
#                             insert!(silhouette_scores, insertion_index, this_silhouette_score)
#                         end

#                         new_optimal_silhouette, new_optimal_index = findmax(silhouette_scores)
#                         new_optimal_number_of_clusters = ks_to_try[new_optimal_index]
#                     end
#                 end
#             end
#         end
#     end
#     @assert length(within_cluster_sum_of_squares) == length(silhouette_scores)
#     ks_assessed = ks_to_try[1:length(within_cluster_sum_of_squares)]
#     return optimal_number_of_clusters, ks_assessed, within_cluster_sum_of_squares, silhouette_scores
# end

In [9]:
function fit_optimal_number_of_clusters(distance_matrix)
    ks_to_try = [1, Int(round(size(distance_matrix, 1)/2)), size(distance_matrix, 1)]
    
    # Int(round(size(distance_matrix, 1)/2))
    # insert!(ks_to_try, 2, Int(round(size(distance_matrix, 1)))))
    # ks_to_try = vcat([2^i for i in 0:Int(floor(log2(size(distance_matrix, 1))))], size(distance_matrix, 1))
    # @info "ks = $(ks_to_try)"
    @show ks_to_try
    
    # can calculate this for k >= 1
    # within_cluster_sum_of_squares = Union{Float64, Missing}[]
    within_cluster_sum_of_squares = Float64[]
    # these are only valid for k >= 2 so set initial value to missing
    # between_cluster_sum_of_squares = [missing, zeros(length(ks_to_try)-1)...]
    # silhouette_scores = Union{Float64, Missing}[]
    silhouette_scores = Float64[]
        
    for k in ks_to_try
        this_clustering = Clustering.kmeans(distance_matrix, k)
        push!(within_cluster_sum_of_squares, wcss(this_clustering))
        if k == 1
            push!(silhouette_scores, 0)
        else
            this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
            push!(silhouette_scores, this_silhouette_score)
        end
    end            
    optimal_silhouette, optimal_index = findmax(silhouette_scores)
    previous_optimal_number_of_clusters = 0
    optimal_number_of_clusters = ks_to_try[optimal_index]
    done = false
    while optimal_number_of_clusters != previous_optimal_number_of_clusters
        @show optimal_number_of_clusters
        if optimal_index == 1
            window_of_focus = ks_to_try[optimal_index:optimal_index+1]
            insert!(window_of_focus, 2, Int(round(Statistics.mean(window_of_focus))))
        elseif optimal_index == length(ks_to_try)
            window_of_focus = ks_to_try[optimal_index-1:optimal_index]
            insert!(window_of_focus, 2, Int(round(Statistics.mean(window_of_focus))))
        else
            window_of_focus = ks_to_try[optimal_index-1:optimal_index+1]
        end
        # @show window_of_focus
        midpoints = [
            Int(round(Statistics.mean(window_of_focus[1:2]))),
            Int(round(Statistics.mean(window_of_focus[2:3])))
            ]
        # @show sort(vcat(midpoints, window_of_focus))
        @show midpoints
        
        for k in midpoints
            insertion_index = first(searchsorted(ks_to_try, k))
            if ks_to_try[insertion_index] != k
                insert!(ks_to_try, insertion_index, k)
                # @show k
                this_clustering = Clustering.kmeans(distance_matrix, k)
                insert!(within_cluster_sum_of_squares, insertion_index, wcss(this_clustering))
                this_silhouette_score = Statistics.mean(Clustering.silhouettes(this_clustering, distance_matrix))
                insert!(silhouette_scores, insertion_index, this_silhouette_score)
            end
        end
        
        previous_optimal_number_of_clusters = optimal_number_of_clusters
        optimal_silhouette, optimal_index = findmax(silhouette_scores)
        optimal_number_of_clusters = ks_to_try[optimal_index]
    end
    @assert length(within_cluster_sum_of_squares) == length(silhouette_scores)
    ks_assessed = ks_to_try[1:length(within_cluster_sum_of_squares)]
    return optimal_number_of_clusters, ks_assessed, within_cluster_sum_of_squares, silhouette_scores
end

fit_optimal_number_of_clusters (generic function with 1 method)

# Define Parameters

In [12]:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?&id=$(tax_id)
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?lvl=0&amp;id=2733124
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10663
root_tax_id = 10663
host = "Escherichia"

# # set to 0 for no subsetting
# subset_n = 0

# 2 was too small and produced unstable results
# subset_n = 2
subset_n = 10

10

# Workflow

## Filter down all NCBI entities to get relevant metadata info

In [13]:
# # TODO
# # here is where we should apply a filter where host == Escherichia
# # need to load host information into neo4j taxonomy

# child_tax_ids = vcat(Mycelia.taxonomic_id_to_children(root_tax_id), root_tax_id)
# # child_tax_ids = vcat(child_tax_ids, root_tax_id)

# # # refseq_metadata = Mycelia.load_refseq_metadata()
# ncbi_metadata = Mycelia.load_genbank_metadata()

In [14]:
# # show(ncbi_metadata[1:1, :], allcols=true)
# tax_id_filter = map(taxid -> taxid in child_tax_ids, ncbi_metadata[!, "taxid"])
# is_right_host = map(x -> occursin(Regex(host, "i"), x), ncbi_metadata[!, "organism_name"])
# not_excluded = ncbi_metadata[!, "excluded_from_refseq"] .== ""
# is_full = ncbi_metadata[!, "genome_rep"] .== "Full"
# # assembly_levels = ["Complete Genome"]
# assembly_levels = ["Complete Genome", "Chromosome"]
# # assembly_levels = ["Complete Genome", "Chromosome", "Scaffold"]
# # assembly_levels = ["Complete Genome", "Chromosome", "Scaffold", "Contig"]
# assembly_level_filter = map(x -> x in assembly_levels, ncbi_metadata[!, "assembly_level"])
# full_filter = is_full .& not_excluded .& assembly_level_filter .& tax_id_filter .& is_right_host
# @show count(full_filter)

# indices = findall(full_filter)

# if subset_n != 0
#     indices = StatsBase.sample(indices, subset_n)
# end

# ncbi_metadata_of_interest = ncbi_metadata[StatsBase.sample(indices, subset_n), :]

## Acquire pangenome input files

In [15]:
# # can I also get genbank record?????
# # for extension in ["genomic.fna.gz", "protein.faa.gz"]
# for extension in ["genomic.fna.gz", "protein.faa.gz", "genomic.gbff.gz"]
#     outdir = mkpath(joinpath(DIR, extension))
#     ProgressMeter.@showprogress for row in DataFrames.eachrow(ncbi_metadata_of_interest)
#         url = Mycelia.ncbi_ftp_path_to_url(row["ftp_path"], extension)
#         outfile = joinpath(outdir, basename(url))
#         if !isfile(outfile)
#             try
#                 Downloads.download(url, outfile)
#             catch e
#                 # @show e
#                 showerror(stdout, e)
#                 # @assert extension == "protein.faa.gz"
#                 # here is where we should call prodigal to fill in protein annotations if we don't otherwise see them
#             end
#         end
#     end
# end

## Assess Protein Clusters

### This section generates a distance matrix for the individual proteins, so we can find clusters

In [82]:
extension = "protein.faa.gz"
outdir = mkpath(joinpath(DIR, extension))
joint_fasta_outfile = outdir * ".joint.faa.gz"
fastx_files = filter(x -> !occursin(".ipynb_checkpoints", x), readdir(outdir, join=true))

8-element Vector{String}:
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_002618005.2_ASM261800v1_protein.faa.gz"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_005394485.1_ASM539448v1_protein.faa.gz"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_005394635.1_ASM539463v1_protein.faa.gz"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_020474825.1_ASM2047482v1_protein.faa.gz"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_020474885.1_ASM2047488v1_protein.faa.gz"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_020869095.1_ASM2086909v1_protein.faa.gz"
 "/home/jupyter-cameron.prybol/workspace/2022-03-1

In [31]:

# record_table = DataFrames.DataFrame(
#     fastx_file = String[],
#     record_identifier = String[],
#     record_description = String[]
# )
# ProgressMeter.@showprogress for fastx_file in fastx_files
#     for record in Mycelia.open_fastx(fastx_file)
#         row = (
#             fastx_file = fastx_file,
#             record_identifier = FASTX.identifier(record),
#             record_description = FASTX.description(record)
#         )
#         push!(record_table, row)
#     end
# end
# record_table
# if !isfile(joint_fasta_outfile)
#     open(joint_fasta_outfile, "w") do io
#         for fastx_file in fastx_files
#             write(io, read(fastx_file))
#         end
#     end
# end

In [32]:
# # conda install -c bioconda diamond
# @time run(`diamond makedb --in $(joint_fasta_outfile) -d $(joint_fasta_outfile)`)

# N_RECORDS = DataFrames.nrow(record_table)
# sensitivity = [
#     # "--fast",                   #enable fast mode
#     # "--mid-sensitive",          #enable mid-sensitive mode
#     # "--sensitive",              #enable sensitive mode
#     # "--more-sensitive",         #enable more sensitive mode
#     # "--very-sensitive",         #enable very sensitive mode
#     # "--ultra-sensitive",        #enable ultra sensitive mode
#     "--iterate"                #iterated search with increasing sensitivity
# ]

# # iterate
# # Total time = 1.16s
# # Reported 46718 pairwise alignments, 46718 HSPs.
# # sensitive
# # Total time = 5.673s
# # Reported 49976 pairwise alignments, 49976 HSPs.
# # ultra sensitive
# # Total time = 14.939s
# # Reported 52446 pairwise alignments, 52446 HSPs.


# @time run(`diamond blastp $(sensitivity) --id 0 --min-score 0 --max-target-seqs $(N_RECORDS) --unal 1 --outfmt 6 qseqid sseqid pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore -d $(joint_fasta_outfile).dmnd -q $(joint_fasta_outfile) -o $(joint_fasta_outfile).diamond.tsv`)
# # pairwise output is all of the alignments, super helpful!
# # @time run(`diamond blastp $(sensitivity) --id 0 --min-score 0 --max-target-seqs $(N_RECORDS) --unal 1 --outfmt 0  -d $(joint_fasta_outfile).dmnd -q $(joint_fasta_outfile) -o $(joint_fasta_outfile).diamond.pairwise.txt`)

In [33]:
# column_names_to_types = [
#     "qseqid" => String,
#     "sseqid" => String,
#     "pident" => Float64,
#     "length" => Int,
#     "mismatch" => Int,
#     "gapopen" => Int,
#     "qlen" => Int,
#     "qstart" => Int,
#     "qend" => Int,
#     "slen" => Int,
#     "sstart" => Int,
#     "send" => Int,
#     "evalue" => Float64,
#     "bitscore" => Float64,
# ]

In [34]:
# use to infer column types
# println(join(first.(column_names_to_types), '\t'))
# open("$(joint_fasta_outfile).diamond.tsv") do io
#     println(readline(io))
#     println(readline(io))
#     println(readline(io))
# end

In [35]:
# # type_detect_rows = max(countlines("$(joint_fasta_outfile).diamond.tsv") / 10, 2_000)
# blastp_results = DataFrames.DataFrame(uCSV.read("$(joint_fasta_outfile).diamond.tsv", header=0, delim='\t', types = Dict(i => t for (i, t) in enumerate(last.(column_names_to_types))))[1], first.(column_names_to_types))
# uCSV.write("$(joint_fasta_outfile).diamond.with_header.tsv", blastp_results, delim='\t')
# show(blastp_results, allcols=true)

In [36]:
# id_to_index_map = Dict(identifier => i for (i, identifier) in enumerate(record_table[!, "record_identifier"]))

In [37]:
# distance_matrix = ones(N_RECORDS, N_RECORDS)
# for row in DataFrames.eachrow(blastp_results)
#     row_idx = id_to_index_map[row["qseqid"]]
#     col_idx = id_to_index_map[row["sseqid"]]
#     # distance = 1 - (row["pident"] / 100)
#     sequence_identity = row["pident"] / 100
#     size_identity = row["length"] / max(row["qlen"], row["slen"])
#     overall_identity = sequence_identity * size_identity
#     distance = 1 - (overall_identity)
#     distance_matrix[row_idx, col_idx] = distance
# end
# distance_matrix

In [38]:
# just percent identity
# Summary Stats:
# Length:         13942756
# Missing Count:  0
# Mean:           0.996645
# Minimum:        0.000000
# 1st Quartile:   1.000000
# Median:         1.000000
# 3rd Quartile:   1.000000
# Maximum:        1.000000
# Type:           Float64

# percent size and percent identity
# Summary Stats:
# Length:         13942756
# Missing Count:  0
# Mean:           0.996742
# Minimum:        0.000000
# 1st Quartile:   1.000000
# Median:         1.000000
# 3rd Quartile:   1.000000
# Maximum:        1.000000
# Type:           Float64

# StatsBase.describe(vec(distance_matrix))

In [39]:
# # sample_n = min(size(distance_matrix, 1), 10_000)
# # sample_n = size(distance_matrix, 1)
# # sample_indices = StatsBase.sample(1:size(distance_matrix, 1), sample_n, ordered=true)
# # 517.958321 seconds (282.75 k allocations: 1.888 GiB, 10.37% gc time, 0.06% compilation time)

# # @time optimal_number_of_clusters, ks_assessed, within_cluster_sum_of_squares, silhouette_scores = fit_optimal_number_of_clusters(distance_matrix[sample_indices, sample_indices])
# @time optimal_number_of_clusters, ks_assessed, within_cluster_sum_of_squares, silhouette_scores = fit_optimal_number_of_clusters(distance_matrix)

In [40]:
# p1 = StatsPlots.plot(
#     ks_assessed[1:length(within_cluster_sum_of_squares)],
#     within_cluster_sum_of_squares,
#     ylabel = "within cluster sum of squares\n(lower is better)",
#     xlabel = "n clusters",
#     legend=false
# )
# StatsPlots.vline!(p1, [optimal_number_of_clusters])
# p2 = StatsPlots.plot(
#     ks_assessed[1:length(silhouette_scores)],
#     silhouette_scores,
#     ylabel = "silhouette scores\n(higher is better)",
#     xlabel = "n clusters",
#     title = "Optimal n clusters = $(optimal_number_of_clusters)",
#     legend=false
# )
# StatsPlots.vline!(p2, [optimal_number_of_clusters])
# # TODO write me out
# display(p2)
# display(p1)

In [41]:
# optimal_clustering_result = Clustering.kmeans(distance_matrix, optimal_number_of_clusters)
# record_table[!, "cluster_assignments"] = optimal_clustering_result.assignments
# uCSV.write("$(joint_fasta_outfile).cluster_metadata.tsv", record_table, delim='\t')
# show(record_table, allcols=true)

## Generate protein cluster summary profiles

In [42]:
record_table = DataFrames.DataFrame(uCSV.read("$(joint_fasta_outfile).cluster_metadata.tsv", delim='\t', header=1)...)

Unnamed: 0_level_0,fastx_file
Unnamed: 0_level_1,String
1,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
2,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
3,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
4,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
5,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
6,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
7,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
8,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
9,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz
10,/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz/GCA_000925055.1_ViralProj266786_protein.faa.gz


In [43]:
record_table[!, "normalized_description"] = normalize_ncbi_description.(lowercase.(record_table[!, "record_description"]))

2183-element Vector{String}:
 "hypothetical protein"
 "hypothetical protein"
 "dna gyrase subunit b"
 "hypothetical protein"
 "hypothetical protein"
 "hypothetical protein"
 "hypothetical protein"
 "hypothetical protein"
 "hypothetical protein"
 "hypothetical protein"
 "hypothetical protein"
 "hypothetical protein"
 "hypothetical protein"
 ⋮
 "anti-restriction nuclease"
 "flagellar motor protein"
 "hypothetical protein"
 "dna topoisomerase ii medium subunit"
 "hypothetical protein"
 "naphthalene 1,2-dioxygenase"
 "naphthalene 1,2-dioxygenase"
 "hypothetical protein"
 "hypothetical protein"
 "dna endonuclease iv"
 "endonuclease"
 "riia lysis inhibitor"

In [44]:
show(record_table, allcols=true)

[1m2183×5 DataFrame[0m
[1m  Row [0m│[1m fastx_file                        [0m[1m record_identifier [0m[1m record_description                [0m[1m cluster_assignments [0m[1m normalized_description            [0m
[1m      [0m│[90m String                            [0m[90m String            [0m[90m String                            [0m[90m Int64               [0m[90m String                            [0m
──────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    1 │ /home/jupyter-cameron.prybol/wor…  AFO10528.1         hypothetical protein ECML134_001…                   61  hypothetical protein
    2 │ /home/jupyter-cameron.prybol/wor…  AFO10529.1         hypothetical protein ECML134_002…                  132  hypothetical protein
    3 │ /home/jupyter-cameron.prybol/wor…  AFO10530.1         DNA gyrase subunit B [Escherichi…                  342  dna gyrase subunit 

In [53]:
df = document_frequency(record_table[!, "normalized_description"])

function term_frequency(document::AbstractString)
    return term_frequency([documents])
end

function term_frequency(documents::AbstractVector)
    countmap = StatsBase.countmap(split(strip(first(documents))))
    for document in documents[2:end]
        merge!(+, countmap, StatsBase.countmap(split(strip(document))))
    end
    return countmap
end

In [87]:
joint_fasta_outfile

"/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.faa.gz.joint.faa.gz"

In [88]:
cluster_names = String[]
for (i, cluster) in enumerate(DataFrames.groupby(sort(record_table, "cluster_assignments"), "cluster_assignments"))
    @assert cluster[1, "cluster_assignments"] == i
    tfs = collect(term_frequency(cluster[!, "normalized_description"]))
    tf_idfs = [term => frequency / df[term] for (term, frequency) in tfs]
    tf_idfs = filter(x -> !any(y -> x[1] != y[1] && occursin(x[1], y[1]), tf_idfs), tf_idfs)
    sort!(tf_idfs, by=x->x[2], rev=true)
    push!(cluster_names, join(first.(tf_idfs), " "))
    
end
cluster_name_table = DataFrames.DataFrame(
    cluster_id = sort(unique(record_table[!, "cluster_assignments"])),
    cluster_name = cluster_names
)
uCSV.write("$(joint_fasta_outfile).cluster_descriptions.tsv", cluster_name_table, delim='\t')

In [90]:
# heatmap of clusters against genomes
n_fastas = length(fastx_files)
n_clusters = DataFrames.nrow(cluster_name_table)
fasta_cluster_containment_matrix = falses(n_fastas, n_clusters)

for (i, fastx_file_group) in enumerate(DataFrames.groupby(record_table, "fastx_file"))
    clusters_contained = unique(fastx_file_group[!, "cluster_assignments"])
    for cluster in clusters_contained
        fasta_cluster_containment_matrix[i, cluster] = true
    end
end

clusters_ordered_by_coreness = sortperm(map(col -> sum(col), eachcol(fasta_cluster_containment_matrix)), rev=true)
p = StatsPlots.heatmap(
    fasta_cluster_containment_matrix[:, clusters_ordered_by_coreness],
    # legend = false,
    title = "Core and accessory protein clusters",
    ylabel = "genome index",
    xlabel = "ordered protein clusters",
    # yticks = false,
    # xticks = false,
    margins = 1StatsPlots.cm
)
StatsPlots.savefig(p, "$(joint_fasta_outfile).core-clusters.svg")
StatsPlots.savefig(p, "$(joint_fasta_outfile).core-clusters.png")

In [None]:
# CREATE CONSUSES PROTEIN FOR EACH PROTEIN CLUSTER

In [93]:
clusters = sort(unique(record_table[!, "cluster_assignments"]))
cluster_fasta_files = [replace(joint_fasta_outfile, ".faa.gz" => "") .* ".cluster_$(cluster).faa" for cluster in clusters]

359-element Vector{String}:
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_1.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_2.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_3.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_4.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_5.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_6.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_7.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_8.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_9.faa"
 "/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequa

In [95]:
# for each cluster, write out cluster to a specific fasta file

cluster_fasta_ios = [FASTX.FASTA.Writer(open(f, "w")) for f in cluster_fasta_files]
for record in Mycelia.open_fastx(joint_fasta_outfile)
    # [FASTX.identifier(record)]
    record_index = findfirst(record_table[!, "record_identifier"] .== FASTX.identifier(record))
    cluster_assignment = record_table[record_index, "cluster_assignments"]
    cluster_io = cluster_fasta_ios[cluster_assignment]
    write(cluster_io, record)
end
for io_stream in cluster_fasta_ios
    close(io_stream)
end

In [None]:
# write out clustalw alignment for each fasta

In [96]:
ProgressMeter.@showprogress for cluster_fasta_file in cluster_fasta_files
    # for outfmt in ["fasta", "clustal", "msf", "phylip", "selex", "stockholm", "vienna"]
    for outfmt in ["clustal"]
        outfile = "$(cluster_fasta_file).clustal_omega.$(outfmt)"
        if !isfile(outfile)
            try
                run(`clustalo -i $(cluster_fasta_file) --outfmt $(outfmt) -o $(outfile)`)
            catch e
                # FATAL: File '...' contains 1 sequence, nothing to align
                continue
            end
        end
    end
end

[32mProgress:  36%|███████████████                          |  ETA: 0:00:23[39mFATAL: File '/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_135.faa' contains 1 sequence, nothing to align
[32mProgress:  53%|█████████████████████▉                   |  ETA: 0:00:15[39mFATAL: File '/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_193.faa' contains 1 sequence, nothing to align
[32mProgress:  58%|███████████████████████▊                 |  ETA: 0:00:13[39mFATAL: File '/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_210.faa' contains 1 sequence, nothing to align
[32mProgress:  60%|████████████████████████▌                |  ETA: 0:00:13[39mFATAL: File '/home/jupyter-cameron.prybol/workspace/2022-03-12-ecoli-tequatrovirus/protein.joint.cluster_216.faa' contains 1 sequence, nothing to align
[32mProgress:  64%|██████████████████████████▍              |  ETA:

In [None]:
fastx_to_aamer_graph

In [None]:
read in the list of fastas

In [None]:
count aamers

In [None]:
initialize graph with aamer nodes and counts

In [None]:
add edges

In [None]:
edges have weights too?