In [None]:
# Per-experiment working directory: ~/workspace/<DATE>-<TASK>.
DATE = "2022-01-23"
TASK = "sample-core-genome"
# `mkpath` creates the directory (and any missing parents) and returns the path.
DIR = mkpath("$(homedir())/workspace/$(DATE)-$(TASK)")
# All relative paths used later in the notebook resolve against DIR.
cd(DIR)

In [None]:
# Registered packages this notebook depends on; each one is installed and then
# imported by name below.
pkgs = [
    "Graphs",
    "MetaGraphs",
    "FileIO",
    "JLD2",
    "DataStructures",
    "BioSequences",
    "Random",
    "FASTX",
    "Revise",
    "StatsBase"
]

import Pkg
Pkg.add(pkgs)
# `import` needs a literal module name, so the statement is built as a string
# and evaluated at top level for each entry. `basename` leaves plain package
# names (no path separator) unchanged.
for pkg in pkgs
    eval(Meta.parse("import $(basename(pkg))"))
end

# Mycelia is imported separately and is not installed by the `Pkg.add` above.
import Mycelia

In [None]:
"""
    kmer_path_to_sequence(kmer_path)

Collapse an ordered path of overlapping k-mers into a single DNA sequence.

The sequence starts as a copy of the first k-mer. Every following k-mer must
overlap the current end of the sequence by `k - 1` symbols (where `k` is that
k-mer's length); its final symbol is then appended.

# Arguments
- `kmer_path`: ordered, non-empty collection of k-mers. Calling `first` on an
  empty collection raises an error.

# Returns
A `BioSequences.LongDNASeq` spelling out the path.

# Throws
- `AssertionError` if a k-mer does not overlap the end of the sequence built so
  far.
"""
function kmer_path_to_sequence(kmer_path)
    sequence = BioSequences.LongDNASeq(first(kmer_path))
    for kmer in Iterators.drop(kmer_path, 1)
        # Check the full (k-1)-symbol overlap instead of assuming k == 3.
        overlap = length(kmer) - 1
        offset = lastindex(sequence) - overlap
        @assert all(kmer[i] == sequence[offset + i] for i in 1:overlap) "k-mer $(kmer) does not overlap the end of the sequence built so far"
        push!(sequence, kmer[end])
    end
    return sequence
end

In [None]:
# Extend `walk` at both ends towards `targets`.
#
# The forward search starts from the last k-mer of the walk. The backward search
# starts from the reverse complement of the first k-mer; its result is
# reverse-complemented and reversed so it can be prepended. The element shared
# with `walk` is dropped from each extension before concatenating.
function _extend_walk_both_ways(graph, walk, targets)
    forward_source = last(walk)
    reverse_source = BioSequences.reverse_complement(first(walk))
    forward_walk, _ = Mycelia.dijkstra(graph, forward_source, targets, search_strategy=:DFS)
    reverse_walk, _ = Mycelia.dijkstra(graph, reverse_source, targets, search_strategy=:DFS)
    return vcat(
        reverse(BioSequences.reverse_complement.(reverse_walk))[1:end-1],
        walk,
        forward_walk[2:end],
    )
end

"""
    find_graph_core(graph; vertices=collect(Graphs.vertices(graph)), seed=rand(0:typemax(Int)))

Sample a "core" walk through a k-mer graph.

A subset of k-mers is drawn without replacement, weighted by the counts stored
in the graph's `:kmer_counts` property. One of them (again count-weighted)
becomes the start of the walk; the rest, together with their reverse
complements, become targets. The walk is then repeatedly extended from both
ends with `Mycelia.dijkstra` until every target has been reached or an
extension round no longer lengthens the walk.

# Keywords
- `vertices`: vertices to consider. Should be either the whole graph (default)
  or one connected component.
- `seed`: seed passed to `Random.seed!`, which reseeds the global RNG. The
  default is drawn from the non-negative integers because older Julia releases
  reject negative seeds.

# Returns
The walk as a vector of k-mers.
"""
function find_graph_core(
    graph;
    vertices=collect(Graphs.vertices(graph)),
    seed=rand(0:typemax(Int)),
)
    Random.seed!(seed)

    # Sample 10% of the vertices, but at least 3 (or all of them when fewer
    # than 3 are available).
    min_nodes = min(3, length(vertices))
    ten_percent = round(Int, length(vertices) * 0.10)
    n_nodes_to_sample = max(min_nodes, ten_percent)

    # Look the count table up once instead of once per k-mer.
    kmer_counts = MetaGraphs.get_prop(graph, :kmer_counts)
    kmers = [MetaGraphs.get_prop(graph, v, :kmer) for v in sort(vertices)]
    counts = [kmer_counts[kmer] for kmer in kmers]
    selected_nodes = StatsBase.sample(
        kmers, StatsBase.weights(counts), n_nodes_to_sample, replace=false
    )

    selected_node_counts = [kmer_counts[kmer] for kmer in selected_nodes]
    starting_node = StatsBase.sample(selected_nodes, StatsBase.weights(selected_node_counts))
    current_walk = [starting_node]

    # Every other sampled k-mer is a target, in either orientation.
    targets = setdiff(selected_nodes, current_walk)
    targets = Set(vcat(targets, BioSequences.reverse_complement.(targets)))

    prior_walk_length = length(current_walk)
    while !isempty(targets)
        current_walk = _extend_walk_both_ways(graph, current_walk, targets)
        targets = setdiff(targets, current_walk)
        # Stop when a round adds nothing; the remaining targets were not reached.
        length(current_walk) == prior_walk_length && break
        prior_walk_length = length(current_walk)
    end

    # One final extension with whatever targets remain (possibly none).
    # NOTE(review): what this adds depends on how Mycelia.dijkstra behaves when
    # no target is reachable or `targets` is empty -- confirm the intent.
    return _extend_walk_both_ways(graph, current_walk, targets)
end

In [None]:
# Create a random reference sequence of length 9. The RNG is seeded with 2 so
# the same sequence is produced on every run.
refseq = BioSequences.randdnaseq(Random.seed!(2), 9)

In [None]:
# Make a variant of the reference by forcing position 5 to T.
# `copy` keeps `refseq` itself unmodified.
# NOTE(review): if refseq[5] is already T the variant equals the reference.
altseq = copy(refseq)
altseq[5] = BioSequences.DNA_T
# Display the variant.
altseq

In [None]:
# Write a small test FASTA: the reference twice, then the variant, with
# identifiers "1", "2" and "3". Afterwards echo the file for inspection.
fasta_file = "$(DIR)/test.fasta"
open(fasta_file, "w") do io
    writer = FASTX.FASTA.Writer(io)
    for (i, seq) in enumerate((refseq, refseq, altseq))
        write(writer, FASTX.FASTA.Record("$i", seq))
    end
    close(writer)
end
println(read(fasta_file, String))

In [None]:
# k-mer length used to build the graph.
k = 3
# Derive the k-mer type from `k` so the two cannot silently disagree.
T = BioSequences.DNAMer{k}

In [None]:
# Build the k-mer graph from the test FASTA using k-mer type `T`.
graph = Mycelia.fastx_to_kmer_graph(T, fasta_file)

In [None]:
# Visualize the full k-mer graph.
Mycelia.plot_graph(graph)

In [None]:
# Sample a core path with a fixed seed so the result is reproducible.
seed = 0
core_path = find_graph_core(graph, seed=seed)

In [None]:
# Reduce the core path to its distinct canonical k-mers and index the graph by
# each one.
# NOTE(review): `graph[kmer, :kmer]` presumably returns the vertex whose :kmer
# property equals `kmer` -- confirm against how Mycelia builds the graph.
selected_vertices = [
    graph[kmer, :kmer] for kmer in unique(BioSequences.canonical.(core_path))
]
# Plot only the part of the graph that the core path visits.
subgraph, vertex_map = Graphs.induced_subgraph(graph, selected_vertices)
Mycelia.plot_graph(subgraph)

In [None]:
# Collapse the sampled k-mer path back into a single sequence.
reconstruted_sequence = kmer_path_to_sequence(core_path)

In [None]:
# Display the reference for visual comparison with the reconstruction.
refseq

In [None]:
# Compare the reconstruction with the reference.
# NOTE(review): presumably `is_equivalent` also accepts the reverse complement
# as a match -- confirm in Mycelia.
Mycelia.is_equivalent(reconstruted_sequence, refseq)

In [None]:
# Advance the seed and sample another core path; re-running this cell steps
# through successive seeds.
seed += 1
core_path = find_graph_core(graph, seed=seed)