In [None]:
# Identifiers for this analysis run; together they name the working directory.
DATE = "2022-01-26"
TASK = "sample-core-genome-circular-recycle"
# Create <home>/workspace/<DATE>-<TASK> (including missing parents); `mkpath` returns the
# path, which is kept in DIR for later cells.
DIR = mkpath("$(homedir())/workspace/$(DATE)-$(TASK)")
# Make the run directory the current working directory.
cd(DIR)

In [None]:
# Names of the registered/standard-library packages this notebook uses.
pkgs = [
    "Graphs",
    "MetaGraphs",
    "FileIO",
    "JLD2",
    "DataStructures",
    "BioSequences",
    "Random",
    "FASTX",
    "Revise",
    "StatsBase"
]

import Pkg
# Add every listed package to the active environment.
Pkg.add(pkgs)
# Import each package by name so its functions are reached through qualified access
# (e.g. `Graphs.nv`). `basename` leaves a plain package name unchanged; presumably it is
# here so entries written as paths/URLs would still import by their final component —
# TODO confirm.
for pkg in pkgs
    eval(Meta.parse("import $(basename(pkg))"))
end

# Mycelia is not in `pkgs`, so it is not added by the `Pkg.add` call above; it must
# already be available in the environment.
import Mycelia

In [None]:
"""
    kmer_path_to_sequence(kmer_path)

Collapse an ordered path of overlapping k-mers into the DNA sequence it spells.

The first k-mer seeds the sequence; every following k-mer must overlap the end of the
sequence built so far by all but its last base, and contributes that last base.
The overlap length is derived from each k-mer's own length, so any k is supported.

# Arguments
- `kmer_path`: non-empty ordered collection of k-mers.

# Returns
A `BioSequences.LongDNASeq` of length `k + length(kmer_path) - 1` (for a fixed k).

# Throws
- `ArgumentError` if `kmer_path` is empty.
- `AssertionError` if a k-mer does not overlap the sequence built so far.
"""
function kmer_path_to_sequence(kmer_path)
    isempty(kmer_path) && throw(ArgumentError("kmer_path must contain at least one k-mer"))
    sequence = BioSequences.LongDNASeq(first(kmer_path))
    # iterate lazily from the second k-mer instead of copying the path with `[2:end]`
    for kmer in Iterators.drop(kmer_path, 1)
        overlap = length(kmer) - 1
        # index just before the suffix of `sequence` that the k-mer must match
        offset = length(sequence) - overlap
        for i in 1:overlap
            @assert kmer[i] == sequence[offset + i] "k-mer $(kmer) does not overlap the end of the sequence built so far"
        end
        push!(sequence, kmer[end])
    end
    return sequence
end

In [None]:
"""
    update_remaining_targets(current_walk, remaining_targets)

Return the targets that `current_walk` has not visited yet, in either orientation.

A target counts as visited when its canonical form equals the canonical form of any
k-mer on the walk. The survivors are returned as a `Set{T}` containing each surviving
canonical k-mer together with its reverse complement.
"""
function update_remaining_targets(current_walk::AbstractVector{T}, remaining_targets::AbstractSet{T}) where T <: BioSequences.AbstractMer
    # compare in canonical space so orientation does not matter
    canonical_targets = BioSequences.canonical.(remaining_targets)
    canonical_visited = BioSequences.canonical.(current_walk)
    unvisited = setdiff(canonical_targets, canonical_visited)
    # expand back out to both orientations
    both_orientations = vcat(unvisited, BioSequences.reverse_complement.(unvisited))
    return Set{T}(both_orientations)
end

In [None]:
# Grow `current_walk` at both ends toward `remaining_targets` and return the new walk.
function _extend_walk_both_ends(graph, current_walk, remaining_targets)
    forward_source = last(current_walk)
    # extending the walk backwards == searching forwards from the reverse complement of its first k-mer
    reverse_source = BioSequences.reverse_complement(first(current_walk))
    # call order is fixed (forward, then reverse) in case the search consumes random numbers
    forward_walk, forward_distance = Mycelia.dijkstra(graph, forward_source, remaining_targets, search_strategy=:DFS)
    reverse_walk, reverse_distance = Mycelia.dijkstra(graph, reverse_source, remaining_targets, search_strategy=:DFS)
    # flip the reverse walk back into the walk's orientation; its last element and the first
    # element of the forward walk are dropped (presumably they repeat the walk's own end
    # k-mers — TODO confirm against Mycelia.dijkstra)
    leading = reverse(BioSequences.reverse_complement.(reverse_walk))[1:end-1]
    return vcat(leading, current_walk, forward_walk[2:end])
end

"""
    find_graph_core(graph; seed=rand(0:typemax(Int)))

Walk a k-mer graph to find a path that visits its branch-point k-mers.

The walk starts at the vertex with the largest `:count` property (the lowest-numbered
vertex wins ties). Targets are the `:kmer` properties of all vertices of degree >= 3.
The walk is repeatedly extended at both ends with `Mycelia.dijkstra` until every target
has been visited in either orientation, or an extension adds nothing. If the resulting
walk does not start and end on the same k-mer, one more extension is attempted at both
ends.

`graph` is used as a whole; to restrict the search to one connected component, pass an
induced subgraph.

# Arguments
- `graph`: graph whose vertices carry `:kmer` and `:count` properties readable with
  `MetaGraphs.get_prop`.

# Keywords
- `seed`: seed passed to `Random.seed!` (this reseeds the global RNG). The default is
  drawn from the non-negative integers because `Random.seed!` rejects negative integer
  seeds on Julia releases before 1.11.

# Returns
A vector of k-mers describing the walk.

# Throws
- `ArgumentError` if `graph` has no vertices.
"""
function find_graph_core(graph; seed=rand(0:typemax(Int)))
    Graphs.nv(graph) > 0 || throw(ArgumentError("graph must contain at least one vertex"))

    Random.seed!(seed)

    # start from the most abundant k-mer; strict `>` keeps the first vertex on ties
    max_kmer_count = MetaGraphs.get_prop(graph, 1, :count)
    starting_kmer = MetaGraphs.get_prop(graph, 1, :kmer)
    for i in 2:Graphs.nv(graph)
        this_count = MetaGraphs.get_prop(graph, i, :count)
        if this_count > max_kmer_count
            max_kmer_count = this_count
            starting_kmer = MetaGraphs.get_prop(graph, i, :kmer)
        end
    end

    # Type the target set explicitly: with no branch points an untyped empty collection
    # would not match the `AbstractSet{T}` signature of `update_remaining_targets`.
    T = typeof(starting_kmer)
    selected_nodes = Set{T}(
        MetaGraphs.get_prop(graph, v, :kmer) for v in Graphs.vertices(graph)
            if Graphs.degree(graph, v) >= 3)

    current_walk = T[starting_kmer]
    remaining_targets = update_remaining_targets(current_walk, selected_nodes)

    while !isempty(remaining_targets)
        prior_walk_length = length(current_walk)
        current_walk = _extend_walk_both_ends(graph, current_walk, remaining_targets)
        remaining_targets = update_remaining_targets(current_walk, remaining_targets)
        # give up once an expansion fails to lengthen the walk
        length(current_walk) == prior_walk_length && break
    end

    # if we haven't closed a loop, try and walk out to ends of contigs/chromosomes
    if first(current_walk) != last(current_walk)
        current_walk = _extend_walk_both_ends(graph, current_walk, remaining_targets)
    end
    return current_walk
end

In [None]:
# Create a reference sequence: 9 random DNA bases drawn from the RNG returned by
# `Random.seed!(2)` (which also reseeds the global RNG), followed by a trailing C,
# for 10 bases in total.
refseq = push!(BioSequences.randdnaseq(Random.seed!(2), 9), BioSequences.DNA_C)

In [None]:
# Build a variant of the reference: an independent copy with position 5 set to T
# (a single-base difference unless the reference already has T there).
altseq = copy(refseq)
altseq[5] = BioSequences.DNA_T
# last expression of the cell, so the variant is displayed
altseq

In [None]:
# Write a small test FASTA: two copies of the reference followed by one copy of the
# variant, with records named by their 1-based position in the file.
fasta_file = "$(DIR)/test.fasta"
open(fasta_file, "w") do io
    fastx_io = FASTX.FASTA.Writer(io)
    for (record_number, sequence) in enumerate((refseq, refseq, altseq))
        write(fastx_io, FASTX.FASTA.Record("$record_number", sequence))
    end
    close(fastx_io)
end
# Echo the file back to confirm what was written.
println(read(fasta_file, String))

In [None]:
# k-mer length used to build the graph.
k = 3
# Derive the k-mer type from `k` so the two cannot drift apart when `k` is changed.
T = BioSequences.BigDNAMer{k}

In [None]:
# Build the k-mer graph for k-mer type `T` from the FASTA file written earlier in the notebook.
graph = Mycelia.fastx_to_kmer_graph(T, fasta_file)

In [None]:
# Plot the full graph.
Mycelia.plot_graph(graph)

In [None]:
# Use an explicit seed so the core search can be repeated with the same RNG state.
seed = 0
core_path = find_graph_core(graph, seed=seed)

In [None]:
# For each distinct canonical k-mer on the core path, look up the vertex whose `:kmer`
# property equals it, then plot the subgraph induced by those vertices.
selected_vertices = [graph[kmer, :kmer] for kmer in unique(BioSequences.canonical.(core_path))]
subgraph, vertex_map = Graphs.induced_subgraph(graph, selected_vertices)
Mycelia.plot_graph(subgraph)

In [None]:
# Spell out the DNA sequence encoded by the core path.
# NOTE(review): the variable name is misspelled ("reconstructed"); left unchanged in
# case other cells refer to it.
reconstruted_sequence = kmer_path_to_sequence(core_path)

In [None]:
# Display the reference for visual comparison with the reconstruction.
refseq

In [None]:
# Repeat the core search with the next seed and report which seed was used.
seed += 1
@show seed
core_path = find_graph_core(graph, seed=seed)