In [None]:
# Per-analysis workspace: <home>/workspace/<DATE>-<TASK>.
# mkpath creates the directory (and any missing parents) and returns its path;
# cd then makes it the working directory for the rest of the notebook.
DATE = "2022-02-20"
TASK = "sample-core-genome"
# joinpath assembles the path with the platform separator instead of a hard-coded "/"
DIR = mkpath(joinpath(homedir(), "workspace", "$(DATE)-$(TASK)"))
cd(DIR)

In [None]:
pkgs = [
    "Graphs",
    "MetaGraphs",
    "FileIO",
    "JLD2",
    "DataStructures",
    "BioSequences",
    "Random",
    "FASTX",
    # "Revise",
    "StatsBase",
    "Statistics",
    "Clustering",
    "StatsPlots",
    "Primes"
]

import Pkg
# Pkg.add(pkgs)
for pkg in pkgs
    eval(Meta.parse("import $(pkg)"))
end

import Mycelia

In [None]:
"""
    update_remaining_targets(current_walk, remaining_targets)

Return the set of targets that `current_walk` has not visited yet.

A target counts as visited when its canonical form matches the canonical form of
any k-mer in the walk, so visiting either strand is enough. The surviving
canonical targets are returned together with their reverse complements as a
`Set{T}`.
"""
function update_remaining_targets(current_walk::AbstractVector{T}, remaining_targets::AbstractSet{T}) where T <: BioSequences.AbstractMer
    # compare walk and targets in canonical space so strand does not matter
    visited = BioSequences.canonical.(current_walk)
    unvisited = setdiff(BioSequences.canonical.(remaining_targets), visited)
    # expand back out so both orientations of every unvisited k-mer are present
    both_strands = vcat(unvisited, BioSequences.reverse_complement.(unvisited))
    return Set{T}(both_strands)
end

In [None]:
"""
    observe(records; weights, N, outfile, error_rate)

Write `N` sampled observations of `records` to `outfile` as FASTA and return
`outfile`.

Each observation is produced by sampling one record according to `weights`,
passing its sequence through `Mycelia.observe` with `error_rate`, and writing the
result under a random identifier. The source record's identifier is stored as the
description of the written record. Output is always FASTA, even when the input
records are FASTQ.

# Keywords
- `weights`: sampling weight per record (default: uniform).
- `N`: number of observations to write (default: `length(records)`).
- `outfile`: path of the FASTA file to create; must be non-empty.
- `error_rate`: forwarded to `Mycelia.observe` (default: `0.0`).

# Throws
- `ErrorException` if `outfile` is empty.
"""
function observe(records::AbstractVector{R};
                weights=ones(length(records)),
                N = length(records),
                outfile = "",
                error_rate = 0.0) where {R <: Union{FASTX.FASTA.Record, FASTX.FASTQ.Record}}
    if isempty(outfile)
        error("no file name supplied")
    end
    # the weights are the same for every draw, so wrap them once outside the loop
    sampling_weights = StatsBase.weights(weights)
    # the do-block form closes the file even if sampling or writing throws
    open(outfile, "w") do io
        fastx_io = FASTX.FASTA.Writer(io)
        try
            for _ in 1:N
                record = StatsBase.sample(records, sampling_weights)
                new_seq = Mycelia.observe(FASTX.sequence(record), error_rate=error_rate)
                # identifier length grows with the log of the sequence length
                new_seq_id = Random.randstring(Int(ceil(log(length(new_seq) + 1))))
                new_seq_description = FASTX.identifier(record)
                observed_record = FASTX.FASTA.Record(new_seq_id, new_seq_description, new_seq)
                write(fastx_io, observed_record)
            end
        finally
            # close the writer on every exit path, as the original did on success
            close(fastx_io)
        end
    end
    return outfile
end

In [None]:
"""
    assess_downstream_weight(graph, kmer)

Sum the `:count` property of every neighbor of `kmer` that is present in `graph`.

Neighbors come from `BioSequences.neighbors(kmer)` and are looked up by their
canonical form through the graph's `:kmer` index. Neighbors that are not in the
graph contribute nothing. The result is used to decide which end of a walk leads
toward heavier-weight k-mers.
"""
function assess_downstream_weight(graph, kmer)
    score = 0
    for neighbor in BioSequences.neighbors(kmer)
        try
            score += MetaGraphs.get_prop(graph, graph[BioSequences.canonical(neighbor), :kmer], :count)
        catch err
            # an absent neighbor surfaces as a KeyError from the index lookup; skip it.
            # Anything else (interrupts, genuine bugs) must not be silently swallowed.
            err isa KeyError || rethrow()
        end
    end
    return score
end

In [None]:
"""
    find_graph_core(graph; seed=rand(Int))

Build a single walk of k-mers that tries to visit every vertex of `graph`.

The walk covers the whole graph passed in. To work on one connected component,
induce a subgraph for that component first and pass it here.

The walk starts at the highest-degree vertex (first one wins on ties). Each round
orients the walk so its tail faces the heavier neighborhood, extends the tail
toward the remaining targets with `Mycelia.dijkstra`, and, if targets remain,
extends the head the same way via the reverse-complement strand. Rounds stop when
no targets remain or when a round fails to lengthen the walk.

# Keywords
- `seed`: seeds the global RNG via `Random.seed!` before searching.

# Throws
- `ArgumentError` if `graph` has no vertices.
"""
function find_graph_core(graph; seed=rand(Int))

    Random.seed!(seed)

    targets = [MetaGraphs.get_prop(graph, v, :kmer) for v in Graphs.vertices(graph)]
    if isempty(targets)
        throw(ArgumentError("cannot find the core of a graph with no vertices"))
    end

    # start from the highest-degree k-mer; strict `>` keeps the first one on ties
    starting_kmer = first(targets)
    max_degree = 0
    for node in targets
        node_degree = Graphs.degree(graph, graph[node, :kmer])
        if node_degree > max_degree
            max_degree = node_degree
            starting_kmer = node
        end
    end

    current_walk = [starting_kmer]
    prior_walk_length = length(current_walk)
    remaining_targets = update_remaining_targets(current_walk, Set(targets))

    while !isempty(remaining_targets)
        # here we look to see if walking forward or backward from the current ends gets us to heavier weight options
        # we want to prioritize walks toward higher coverage nodes
        forward_score = assess_downstream_weight(graph, last(current_walk))
        reverse_score = assess_downstream_weight(graph, BioSequences.reverse_complement(first(current_walk)))
        if reverse_score > forward_score
            # flip to the opposite strand so the heavier side becomes the tail
            current_walk = reverse(BioSequences.reverse_complement.(current_walk))
        end

        forward_source = last(current_walk)
        forward_walk, _ = Mycelia.dijkstra(graph, forward_source, remaining_targets, search_strategy=:DFS)
        # drop the first element of the returned walk
        # NOTE(review): presumably that element is the source itself, already in the walk - confirm
        current_walk = vcat(current_walk, forward_walk[2:end])
        remaining_targets = update_remaining_targets(current_walk, remaining_targets)

        if !isempty(remaining_targets)
            # extend the head by searching from its reverse complement, then flip the
            # result back onto the walk's strand and drop its last element
            reverse_source = BioSequences.reverse_complement(first(current_walk))
            reverse_walk, _ = Mycelia.dijkstra(graph, reverse_source, remaining_targets, search_strategy=:DFS)
            current_walk = vcat(reverse(BioSequences.reverse_complement.(reverse_walk))[1:end-1], current_walk)
            remaining_targets = update_remaining_targets(current_walk, remaining_targets)
        end

        # a round that added nothing cannot make progress later either; stop
        length(current_walk) == prior_walk_length && break
        prior_walk_length = length(current_walk)
    end
    return current_walk
end

In [None]:
"""
    apply_kmedoids_treshold(graph)

Return the subgraph of `graph` whose vertices have a `:count` above a threshold
chosen by k-medoids clustering of the observed count values.

Steps:
1. Collect the distinct vertex counts, sorted ascending, and take `log2` of each.
2. Cluster them with k-medoids for each prime `k` up to `min(n, 63)`, scoring
   each `k` by the mean silhouette over 100 runs, and plot the scores.
3. With the best `k`, run 10 clusterings and keep the one with the highest mean
   silhouette.
4. Set the threshold to the largest count in the lowest-coverage cluster plus
   one, and plot the k-mer frequency spectrum with the threshold marked.
5. Keep vertices with `:count > threshold`, induce the subgraph, and set `:kmer`
   as its indexing property.

# Throws
- `ArgumentError` if the graph has fewer than two distinct count values, since
  there is then nothing to cluster.
"""
function apply_kmedoids_treshold(graph)
    kmer_counts = [MetaGraphs.get_prop(graph, v, :count) for v in Graphs.vertices(graph)]

    # distinct observed counts in ascending order; each one is a depth-of-coverage bin
    count_bins = sort(unique(kmer_counts))
    if length(count_bins) < 2
        throw(ArgumentError("need at least 2 distinct kmer counts to cluster, found $(length(count_bins))"))
    end

    kmer_depth_of_coverage_bins = log2.(count_bins)

    # pairwise absolute difference between bins in log2 space
    distance_matrix = abs.(kmer_depth_of_coverage_bins .- kmer_depth_of_coverage_bins')

    # max out k at the same max k we use for DNAMers
    max_k = min(length(kmer_depth_of_coverage_bins), 63)
    ks = Primes.primes(2, max_k)
    # average the mean silhouette over 100 runs per candidate k
    ys = map(k ->
                Statistics.mean(Statistics.mean(Clustering.silhouettes(Clustering.kmedoids(distance_matrix, k), distance_matrix)) for _ in 1:100),
                ks)

    p = StatsPlots.plot(ks, ys, label="silhouette score", ylabel = "silhouette score", xlabel = "number of clusters")
    display(p)

    _, ymax_index = findmax(ys)
    optimal_k = ks[ymax_index]
    clusterings = [Clustering.kmedoids(distance_matrix, optimal_k) for _ in 1:10]
    _, max_value_index = findmax(clustering -> Statistics.mean(Clustering.silhouettes(clustering, distance_matrix)), clusterings)
    optimal_clustering = clusterings[max_value_index]

    # medoids index into the ascending bins, so the smallest medoid index belongs to
    # the lowest-coverage cluster; findmin's position is that cluster's id
    _, min_medoid_index = findmin(optimal_clustering.medoids)
    in_lowest_cluster = optimal_clustering.assignments .== min_medoid_index
    # read the cutoff from the integer counts directly: 2^log2(count) can land just
    # above `count` in floating point, and ceil would then inflate the threshold by one
    threshold = maximum(count_bins[in_lowest_cluster]) + 1

    scale = 250
    p = Mycelia.plot_kmer_frequency_spectra(kmer_counts, log_scale = log2, size=(2scale,scale), title="kmer frequencies")
    StatsPlots.vline!(p, log2.([threshold]))
    display(p)

    # find all vertices with count > threshold
    vertices_to_keep = [v for v in Graphs.vertices(graph) if (MetaGraphs.get_prop(graph, v, :count) > threshold)]
    # induce subgraph; the returned vertex map is not needed here
    induced_subgraph, _ = Graphs.induced_subgraph(graph, vertices_to_keep)

    # set kmer as indexing prop
    MetaGraphs.set_indexing_prop!(induced_subgraph, :kmer)
    return induced_subgraph
end

In [None]:
# Create a reference sequence
# L is passed to Mycelia.random_fasta_record as its L keyword and is reused below
# as the number of observations to draw (N=L)
L = 1_000
seed = 0
# alternative seeds to try by hand:
# seed = 1
# seed = 2
# seed = 3
# seed = 4
# seed = 5
# seed = 6
# seed = 7
# seed = 8
# seed = 9
record = Mycelia.random_fasta_record(seed=seed, L=L)

In [None]:
# forwarded to observe, which passes it on to Mycelia.observe
error_rate = 0.1

In [None]:
# write L observations of the single reference record to a FASTA file under DIR
observations_file = "$(DIR)/N-$L.observations.fasta"
observe([record], N=L, outfile=observations_file, error_rate=error_rate)

In [None]:
# choose a k-mer size from the observations; the result becomes the k of the k-mer type below
chosen_k = Mycelia.assess_kmer_saturation([observations_file]; outdir="$(observations_file).kmer-assessements")

In [None]:
# k-mer type used to build the graph
T = BioSequences.BigDNAMer{chosen_k}
# T = BioSequences.BigDNAMer{61}

In [None]:
# build the k-mer graph from the observations
graph = Mycelia.fastx_to_kmer_graph(T, observations_file)

In [None]:
# export the graph as GFA next to the observations file
Mycelia.graph_to_gfa(graph, "$(observations_file).gfa")

In [None]:
# display the reference sequence
FASTX.sequence(record)

In [None]:
# look up each canonical reference k-mer through the graph's :kmer index, in
# reference order, and join the results with commas
# NOTE(review): presumably raises if a reference k-mer is absent from the graph - confirm
join(map(kmer -> graph[kmer, :kmer], BioSequences.canonical.(BioSequences.each(T, FASTX.sequence(record)))), ',')

In [None]:
# same lookup, with each entry suffixed by '+' when the forward k-mer is canonical and '-' otherwise
join(map(kmer -> string(graph[BioSequences.canonical(kmer), :kmer]) * (BioSequences.iscanonical(kmer.fw) ? '+' : '-'), collect(BioSequences.each(T, FASTX.sequence(record)))), ',')

In [None]:
# keep only the vertices above the k-medoids-derived count threshold
filtered_graph = apply_kmedoids_treshold(graph)

In [None]:
# walk the filtered graph with seed 0, rebuild a sequence from the walk, and
# compare it with the reference; the comparison result is the cell's value
seed = 0
core_path = find_graph_core(filtered_graph, seed=seed)
reconstruted_sequence = Mycelia.kmer_path_to_sequence(core_path)
Mycelia.is_equivalent(reconstruted_sequence, FASTX.sequence(record))

In [None]:
# bump the seed and repeat the walk and comparison; re-running this cell tries
# the next seed each time
seed += 1
@show seed
core_path = find_graph_core(filtered_graph, seed=seed)
reconstruted_sequence = Mycelia.kmer_path_to_sequence(core_path)
Mycelia.is_equivalent(reconstruted_sequence, FASTX.sequence(record))