In [1]:
# Identifiers for this notebook run; together they name the working directory.
DATE = "2021-09-06"
TASK = "annealing-correction"
# Create "<home>/<DATE>-<TASK>" if it does not exist; mkpath returns the path, kept in DIR.
DIR = mkpath("$(homedir())/$(DATE)-$(TASK)")

"/home/jovyan/2021-09-06-annealing-correction"

In [2]:
import Pkg
# Packages this notebook imports. Entries that are commented out are not loaded.
pkgs = [
    "BioAlignments",
    "BioSequences",
    "Clustering",
#     "CodecZlib",
#     "Colors",
#     "Combinatorics",
    "DataFrames",
    "DataStructures",
    "Dates",
#     "DelimitedFiles",
    "Distances",
#     "Distributions",
#     "EzXML",
    "FASTX",
#     "GFF3",
    "GraphPlot",
#     "HTTP",
#     "Impute",
#     "JSON",
    "LightGraphs",
#     "LSHFunctions",
#     "Measures",
    "MetaGraphs",
    "https://github.com/cjprybol/Mycelia.git",
#     "NumericIO",
#     "PlotlyJS",
#     "Plots",
    "Primes",
#     "Printf",
    "ProgressMeter",
    "Random",
    "Revise",
    "SparseArrays",
    "Statistics",
    "StatsBase",
    "StatsPlots",
#     "StringDistances",
    "uCSV",
#     "XLSX",
]

# Entries that start with "https" or end in "git" are treated as repository URLs;
# every other entry is handled by package name.
unregistered_packages = filter(pkg -> occursin(r"(^https|git$)", pkg), pkgs)
registered_packages = setdiff(pkgs, unregistered_packages)

# Import each named package; if the import throws, add and build it, then retry.
for pkg in registered_packages
    try
        eval(Meta.parse("import $(pkg)"))
    catch
        # NOTE(review): the bare catch treats every import failure as "not installed".
        Pkg.add(pkg)
        Pkg.build(pkg)
        eval(Meta.parse("import $(pkg)"))
    end
end

# Same import-or-install pattern for URL entries, installed with Pkg.develop.
for pkg_url in unregistered_packages
    # Package name = last component of the URL with the ".git" suffix removed.
    pkg_name = replace(basename(pkg_url), ".git" => "")
    try
        eval(Meta.parse("import $(pkg_name)"))
    catch
        Pkg.develop(url=pkg_url)
        Pkg.build(pkg_name)
        eval(Meta.parse("import $(pkg_name)"))
    end
end

In [3]:
"""
    kmer_index_and_orientation_to_kmer(graph, kmer_index, orientation)

Return the k-mer stored under `:kmer` for vertex `kmer_index` in `graph.vprops`.

The stored k-mer is returned unchanged when `orientation` is `true`; when it is `false`
the reverse complement of the stored k-mer is returned instead.
"""
function kmer_index_and_orientation_to_kmer(graph, kmer_index, orientation)
    stored_kmer = graph.vprops[kmer_index][:kmer]
    return orientation ? stored_kmer : BioSequences.reverse_complement(stored_kmer)
end

kmer_index_and_orientation_to_kmer (generic function with 1 method)

In [4]:
"""
    determine_edge_probabilities(graph)

Build an `nv × nv` sparse matrix whose row `v` holds the probability of stepping from
vertex `v` to each of its neighbours.

The probability of a step is first set proportional to the `:count` stored on the
connecting edge (`graph.eprops`), then re-weighted by the `:count` stored on the
destination vertex (`graph.vprops`), and finally renormalised so that the row sums to 1.

Rows of vertices that have no neighbours (or whose edge counts sum to zero) are left
all-zero instead of raising an error.
"""
function determine_edge_probabilities(graph)
    nv = LightGraphs.nv(graph)
    edge_probabilities = SparseArrays.spzeros(nv, nv)

    # Pass 1: per-vertex probabilities proportional to edge counts.
    for v in 1:nv
        neighbors = LightGraphs.neighbors(graph, v)
        # Nothing to normalise for an isolated vertex; its row stays empty.
        isempty(neighbors) && continue
        @assert issorted(neighbors)
        likelihoods = map(neighbors) do neighbor
            # Edge properties are looked up with the (low, high) form of the edge.
            edge = LightGraphs.Edge(min(v, neighbor), max(v, neighbor))
            @assert LightGraphs.has_edge(graph, edge)
            return Float64(graph.eprops[edge][:count])
        end
        total = sum(likelihoods)
        # Avoid 0/0 -> NaN rows when every incident edge has a zero count.
        total > 0 || continue
        likelihoods ./= total
        for (neighbor, likelihood) in zip(neighbors, likelihoods)
            edge_probabilities[v, neighbor] = likelihood
        end
    end

    # Pass 2: re-weight each row by the destination vertex counts and renormalise.
    for source in 1:nv
        destinations = findall(edge_probabilities[source, :] .> 0)
        # Rows skipped in pass 1 have no destinations to re-weight.
        isempty(destinations) && continue
        destination_counts = [graph.vprops[dest][:count] for dest in destinations]
        destination_likelihoods = destination_counts ./ sum(destination_counts)
        for (dest, likelihood) in zip(destinations, destination_likelihoods)
            edge_probabilities[source, dest] *= likelihood
        end
        edge_probabilities[source, :] ./= sum(edge_probabilities[source, :])
        # A one-ulp tolerance is too strict for rows with several entries; allow for
        # ordinary floating-point rounding while still catching real normalisation errors.
        @assert isapprox(sum(edge_probabilities[source, :]), 1.0; atol = 1e-12)
    end

    return edge_probabilities
end

determine_edge_probabilities (generic function with 1 method)

In [5]:
"""
    orient_path(graph, kmers, path, opening_orientation, opening_kmer, closing_orientation)

Assign an orientation and an oriented k-mer to every vertex of `path`.

Starting from `opening_kmer` (with orientation `opening_orientation`) at the first vertex,
each following vertex is matched against the neighbours of the previously chosen oriented
k-mer: a neighbour is accepted when its canonical form occurs exactly once in the sorted
vector `kmers` at the position equal to the vertex index. The recorded orientation is
`true` when the accepted neighbour is already in canonical form.

Returns `(path, path_orientations, path_kmers)`, or `nothing` as soon as a vertex cannot
be reached from the previous oriented k-mer.

`graph` and `closing_orientation` are accepted for interface compatibility but are not
used. Progress is traced with `@show`.
"""
function orient_path(graph, kmers, path, opening_orientation, opening_kmer, closing_orientation)
    vertices = path
    path_orientations = [opening_orientation]
    path_kmers = [opening_kmer]
    @show opening_orientation
    for (i, vertex) in enumerate(vertices[2:end])
        @show i, vertex, last(path_orientations)
        viable_neighbor = nothing
        viable_orientation = nothing
        for neighbor in BioSequences.neighbors(last(path_kmers))
            @show "considering going from $(last(path_kmers)) to $(neighbor)"
            canonical_neighbor = BioSequences.canonical(neighbor)
            canonical_neighbor_index_range = searchsorted(kmers, canonical_neighbor)
            if length(canonical_neighbor_index_range) == 1
                @show canonical_neighbor_index = first(canonical_neighbor_index_range)
                if canonical_neighbor_index == vertex
                    @show "hit, selecting $neighbor"
                    viable_neighbor = neighbor
                    viable_orientation = neighbor == canonical_neighbor
                end
            end
        end
        if isnothing(viable_neighbor) || isnothing(viable_orientation)
            # not a viable path, need to look towards the next miss
            return nothing
        end
        push!(path_kmers, viable_neighbor)
        push!(path_orientations, viable_orientation)
    end
    return vertices, path_orientations, path_kmers
end

orient_path (generic function with 1 method)

In [6]:
"""
    take_a_walk(graph, edge_likelihoods, kmers, kmer, walk_length)

Random walk over the k-mer graph, starting at the oriented k-mer `kmer`.

At every step the neighbours of the current oriented k-mer are enumerated; a neighbour is
a candidate when its canonical form is found in the sorted vector `kmers` and `graph` has
an edge between the current vertex and that vertex. A single candidate is taken directly;
otherwise one is sampled with weights read from row `current vertex` of
`edge_likelihoods`. The chosen (vertex, is-canonical) pair is converted back to an
oriented k-mer and appended to the walk.

Returns the vector of visited oriented k-mers. The walk stops once it holds `walk_length`
k-mers, or earlier when the current k-mer has no candidate successors.
"""
function take_a_walk(graph, edge_likelihoods, kmers, kmer, walk_length)
    # Position of an oriented k-mer's canonical form in `kmers`; must be unique.
    function locate_vertex(oriented_kmer)
        current_kmer_indices = searchsorted(kmers, BioSequences.canonical(oriented_kmer))
        @assert length(current_kmer_indices) == 1
        return first(current_kmer_indices)
    end

    walk = [kmer]
    here = locate_vertex(last(walk))
    while length(walk) < walk_length
        candidates = Tuple{Int, Bool}[]
        for successor in BioSequences.neighbors(kmer)
            canonical_successor = BioSequences.canonical(successor)
            vertex_hits = searchsorted(kmers, canonical_successor)
            isempty(vertex_hits) && continue
            successor_vertex = first(vertex_hits)
            LightGraphs.has_edge(graph, here, successor_vertex) || continue
            push!(candidates, (successor_vertex, successor == canonical_successor))
        end

        # Dead end: hand back what has been walked so far.
        isempty(candidates) && return walk

        chosen = if length(candidates) == 1
            first(candidates)
        else
            candidate_vertices = first.(candidates)
            step_weights = StatsBase.weights(edge_likelihoods[here, candidate_vertices])
            StatsBase.sample(candidates, step_weights)
        end

        kmer = kmer_index_and_orientation_to_kmer(graph, chosen...)
        push!(walk, kmer)
        here = locate_vertex(last(walk))
    end
    return walk
end

take_a_walk (generic function with 1 method)

In [7]:
# create a random genome

In [121]:
L = 10

10

In [122]:
k = 3

3

In [123]:
genome = BioSequences.randdnaseq(Random.seed!(L), L)

10nt DNA Sequence:
AGCCTGCAAA

In [124]:
kmer_counts = sort!(Mycelia.count_canonical_kmers(BioSequences.BigDNAMer{k}, genome))

OrderedCollections.OrderedDict{BioSequences.BigDNAMer{3}, Int64} with 7 entries:
  AAA => 1
  AGC => 1
  AGG => 1
  CAA => 1
  CAG => 1
  GCA => 2
  GCC => 1

In [125]:
K = length(keys(kmer_counts))

7

In [126]:
# create an undirected kmer graph from the sequence

In [127]:
graph = MetaGraphs.MetaGraph(K)

{7, 0} undirected Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [128]:
# Store each k-mer and its count as the :kmer and :count properties of vertex i;
# vertex numbers follow the iteration order of kmer_counts.
for (i, (kmer, count)) in enumerate(kmer_counts)
    @show i, kmer, count
    @show MetaGraphs.set_prop!(graph, i, :kmer, kmer)
    @show MetaGraphs.set_prop!(graph, i, :count, count)
end

(i, kmer, count) = (1, AAA, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (2, AGC, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (3, AGG, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (4, CAA, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (5, CAG, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (6, GCA, 2)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (7, GCC, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true


In [129]:
graph.vprops

Dict{Int64, Dict{Symbol, Any}} with 7 entries:
  5 => Dict(:count=>1, :kmer=>CAG)
  4 => Dict(:count=>1, :kmer=>CAA)
  6 => Dict(:count=>2, :kmer=>GCA)
  7 => Dict(:count=>1, :kmer=>GCC)
  2 => Dict(:count=>1, :kmer=>AGC)
  3 => Dict(:count=>1, :kmer=>AGG)
  1 => Dict(:count=>1, :kmer=>AAA)

In [130]:
kmers = collect(keys(kmer_counts))

7-element Vector{BioSequences.BigDNAMer{3}}:
 AAA
 AGC
 AGG
 CAA
 CAG
 GCA
 GCC

In [131]:
# Slide a (k+1)-wide window over the genome. Each window yields a source k-mer (first k
# bases) and a destination k-mer (last k bases); the edge between their canonical forms is
# created with :count 1 or has its :count incremented.
for i in 1:length(genome)-k
    edge_range = i:i+k
    edge = genome[edge_range]
    src = BioSequences.BigDNAMer{k}(edge[1:end-1])
    dst = BioSequences.BigDNAMer{k}(edge[2:end])
    canonical_src = BioSequences.canonical(src)
    canonical_dst = BioSequences.canonical(dst)
    src_index = Mycelia.get_kmer_index(kmers, canonical_src)
    dst_index = Mycelia.get_kmer_index(kmers, canonical_dst)
    @show edge
    @show src_index, src == canonical_src, dst_index, dst == canonical_dst
    # graph.eprops lists undirected edges as low => high. Build the edge in that form so
    # the direct eprops lookup below also finds an edge that was first seen (or is now
    # seen) in descending index order; otherwise has_edge succeeds but the lookup raises
    # a KeyError.
    graph_edge = LightGraphs.Edge(min(src_index, dst_index), max(src_index, dst_index))
    if LightGraphs.has_edge(graph, graph_edge)
        current_count = graph.eprops[graph_edge][:count]
        MetaGraphs.set_prop!(graph, graph_edge, :count, current_count+1)
    else
        LightGraphs.add_edge!(graph, graph_edge)
        MetaGraphs.set_prop!(graph, graph_edge, :count, 1)
    end
end

edge = AGCC
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (2, true, 7, true)
edge = GCCT
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (7, true, 3, false)
edge = CCTG
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (3, false, 5, false)
edge = CTGC
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (5, false, 6, false)
edge = TGCA
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (6, false, 6, true)
edge = GCAA
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (6, true, 4, true)
edge = CAAA
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (4, true, 1, true)


In [132]:
graph

{7, 7} undirected Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [133]:
graph.eprops

Dict{LightGraphs.SimpleGraphs.SimpleEdge{Int64}, Dict{Symbol, Any}} with 7 entries:
  Edge 3 => 7 => Dict(:count=>1)
  Edge 5 => 6 => Dict(:count=>1)
  Edge 2 => 7 => Dict(:count=>1)
  Edge 6 => 6 => Dict(:count=>1)
  Edge 3 => 5 => Dict(:count=>1)
  Edge 4 => 6 => Dict(:count=>1)
  Edge 1 => 4 => Dict(:count=>1)

In [134]:
# GraphPlot.gplot(graph, nodesize=[rand(1:2) for i in 1:LightGraphs.nv(graph)])

In [135]:
edge_likelihoods = determine_edge_probabilities(graph)

7×7 SparseArrays.SparseMatrixCSC{Float64, Int64} with 13 stored entries:
  ⋅         ⋅    ⋅        1.0    ⋅     ⋅         ⋅ 
  ⋅         ⋅    ⋅         ⋅     ⋅     ⋅        1.0
  ⋅         ⋅    ⋅         ⋅    0.5    ⋅        0.5
 0.333333   ⋅    ⋅         ⋅     ⋅    0.666667   ⋅ 
  ⋅         ⋅   0.333333   ⋅     ⋅    0.666667   ⋅ 
  ⋅         ⋅    ⋅        0.25  0.25  0.5        ⋅ 
  ⋅        0.5  0.5        ⋅     ⋅     ⋅         ⋅ 

In [136]:
# take yen_k shortest paths

In [137]:
genome_kmers = collect(BioSequences.each(BioSequences.BigDNAMer{k}, genome))

8-element Vector{BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}}:
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(1, AGC, GCT)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(2, GCC, GGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(3, CCT, AGG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(4, CTG, CAG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(5, TGC, GCA)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(6, GCA, TGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(7, CAA, TTG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(8, AAA, TTT)

In [138]:
# simulate a noisy observation of the genome (errors may be indels as well as SNPs)

In [153]:
# seed = 0
# seed = 1
# seed = 2
seed = 3

3

In [154]:
@show seed
Random.seed!(seed)
observation = Mycelia.observe(genome, error_rate = 0.1)

seed = 3


11nt DNA Sequence:
TTTGTCAGGCT

In [155]:
alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), observation, genome)

BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 9
  seq:  1 TTTGTCAG-GCT 11
             | | |    
  ref:  0 --AGCCTGCAAA 10


In [156]:
reverse_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), BioSequences.reverse_complement(observation), genome)

BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 1
  seq:  1 AGCCTGACAAA 11
          |||||| ||||
  ref:  1 AGCCTG-CAAA 10


In [157]:
# Continue with whichever strand of the observation aligns better to the genome
# (`.value` is presumably the Levenshtein distance shown in the alignment output).
# When the reverse complement wins, the observation is flipped in place and the
# reverse alignment replaces the forward one.
if reverse_alignment.value < alignment.value
    observation = BioSequences.reverse_complement!(observation)
    alignment = reverse_alignment
    @show "flipping"
end

"flipping" = "flipping"


"flipping"

In [158]:
# convert the observation into a stranded k-mer path

In [159]:
# function sequence_to_
observation_as_oriented_kmers = []
observation_kmers = collect(BioSequences.each(BioSequences.BigDNAMer{k}, observation))

9-element Vector{BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}}:
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(1, AGC, GCT)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(2, GCC, GGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(3, CCT, AGG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(4, CTG, CAG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(5, TGA, TCA)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(6, GAC, GTC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(7, ACA, TGT)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(8, CAA, TTG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(9, AAA, TTT)

In [160]:
genome_kmers

8-element Vector{BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}}:
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(1, AGC, GCT)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(2, GCC, GGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(3, CCT, AGG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(4, CTG, CAG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(5, TGC, GCA)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(6, GCA, TGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(7, CAA, TTG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(8, AAA, TTT)

In [161]:
# For every observed k-mer record (a) whether its forward form is already canonical and
# (b) the position of its canonical form in `kmers` (left at 0 when it is not found).
is_canonical = falses(length(observation_kmers))
kmer_index = zeros(Int, length(observation_kmers))
for (i, kmer) in enumerate(observation_kmers)
#     is_canonical[i] = kmer.fw <= kmer.bw
    canonical_kmer = BioSequences.canonical(kmer.fw)
    is_canonical[i] = kmer.fw == canonical_kmer
    # Range of positions in `kmers` equal to the canonical k-mer (empty when absent).
    kmer_index_range = searchsorted(kmers, canonical_kmer)
    if length(kmer_index_range) > 1
        # More than one match is only logged; the index stays 0 in that case.
        @error "bad"
    elseif isempty(kmer_index_range)
        # do nothing, index is 0 to indicate not found
    else
        kmer_index[i] = first(kmer_index_range)
    end
end

In [162]:
is_canonical

9-element BitVector:
 1
 1
 0
 0
 0
 1
 1
 1
 1

In [163]:
kmer_index

9-element Vector{Int64}:
 2
 7
 3
 5
 0
 0
 0
 4
 1

In [169]:
# State for the gap-repair loop.
#   proposed_path                    corrected k-mer path built so far
#   opening_missing_kmer_path_index  position of the first observed k-mer that is absent
#                                    from the graph (`nothing` when every k-mer was found)
#   opening_solid_kmer[_path_index]  last found k-mer before that gap (`nothing` when the
#                                    gap starts at position 1)
#   closing_solid_kmer[_path_index]  first found k-mer after the gap (`nothing` when the
#                                    gap runs to the end of the observation)
proposed_path = BioSequences.BigDNAMer{k}[]
opening_missing_kmer_path_index = findfirst(x -> x == 0, kmer_index)
# Starts at 0 (not `nothing`) and keeps that value when there is no gap at all.
closing_solid_kmer_path_index = 0
# Always define both solid k-mers so that later cells never read an undefined (or stale)
# notebook global.
opening_solid_kmer = nothing
closing_solid_kmer = nothing

if opening_missing_kmer_path_index !== nothing
    if opening_missing_kmer_path_index > 1
        opening_solid_kmer_path_index = opening_missing_kmer_path_index - 1
        opening_solid_kmer = observation_kmers[opening_solid_kmer_path_index].fw
    end
    # The closing k-mer is needed whether or not an opening k-mer exists: a gap that
    # starts at position 1 is repaired by walking backwards from the closing k-mer.
    closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), opening_missing_kmer_path_index+1)
    if closing_solid_kmer_path_index === nothing
        @show "end is open"
        closing_solid_kmer = nothing
    else
        closing_solid_kmer = observation_kmers[closing_solid_kmer_path_index].fw
    end
end

# Gap-repair loop: handles the gap starting at opening_missing_kmer_path_index, then moves
# on to the next one. Runs until no gap remains or the hard cap of 5 iterations is reached.
iterations = 0
while (opening_missing_kmer_path_index != nothing) && (iterations < 5)
    iterations += 1
    @show iterations

    # Case 1: no solid k-mer before the gap, but one after it. Walk away from the closing
    # k-mer on the opposite strand, then reverse-complement and reverse the walk so that
    # it reads forward and ends on the closing k-mer.
    if (opening_solid_kmer == nothing) && (closing_solid_kmer != nothing)
        @show "missing opening"
        @show closing_solid_kmer
        inverted_closing_solid_kmer = BioSequences.reverse_complement(closing_solid_kmer)
        @show inverted_closing_solid_kmer
        # The walk should cover positions 1..closing index of the observation.
        walk_length = closing_solid_kmer_path_index
        @show inverted_closing_solid_kmer
        chosen_walk = take_a_walk(graph, edge_likelihoods, kmers, inverted_closing_solid_kmer, walk_length)
        chosen_walk = reverse(BioSequences.reverse_complement.(chosen_walk))
        @show chosen_walk
        @assert observation_kmers[closing_solid_kmer_path_index].fw == last(chosen_walk)
        proposed_path = chosen_walk
        # Look for the next gap after the closing k-mer and refresh the opening/closing state.
        opening_missing_kmer_path_index = findnext(map(x -> x == 0, kmer_index), closing_solid_kmer_path_index+1)
        
        if opening_missing_kmer_path_index == 1
            opening_solid_kmer = nothing
        elseif opening_missing_kmer_path_index != nothing
            opening_solid_kmer_path_index = opening_missing_kmer_path_index - 1
            opening_solid_kmer = observation_kmers[opening_solid_kmer_path_index].fw
            closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), opening_missing_kmer_path_index+1)
            if closing_solid_kmer_path_index == nothing
                closing_solid_kmer = nothing
            else
                closing_solid_kmer = observation_kmers[closing_solid_kmer_path_index].fw
            end
        end
    end
    
    # Case 2: a solid k-mer before the gap, none after it. Walk forward from the opening
    # k-mer for as many steps as remain in the observation.
    if (opening_solid_kmer != nothing) && (closing_solid_kmer == nothing)
        @show "missing closing"
        walk_length = length(observation_kmers) - opening_solid_kmer_path_index+1
        chosen_walk = take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length)
        @show chosen_walk
        @assert observation_kmers[opening_solid_kmer_path_index].fw == first(chosen_walk)
        # NOTE(review): this assigns a new slice to proposed_path instead of appending to
        # it, and uses length(proposed_path)+1 as an index into observation_kmers --
        # confirm this is intended when proposed_path already holds corrected k-mers.
        proposed_path = getproperty.(observation_kmers[length(proposed_path)+1:opening_solid_kmer_path_index], :fw)
#         proposed_path = 
        # The walk starts on the opening k-mer, which the slice above already contains.
        append!(proposed_path, chosen_walk[2:end])
        # Nothing follows an open end, so the outer loop is finished.
        opening_missing_kmer_path_index = nothing
    end
    
    # Case 3 ("bubble"): solid k-mers on both sides of the gap.
    while (opening_solid_kmer != nothing) && (closing_solid_kmer != nothing)
        @show "bubble!"
        opening_solid_kmer_index = kmer_index[opening_solid_kmer_path_index]
        closing_solid_kmer_index = kmer_index[closing_solid_kmer_path_index]
        # Shortest path(s) between the two graph vertices; used below only to size the walk.
        # NOTE(review): path[1] assumes a_star returned at least one edge -- confirm.
        if opening_solid_kmer_index != closing_solid_kmer_index
            path = LightGraphs.a_star(graph, opening_solid_kmer_index, closing_solid_kmer_index)
            normalized_path = Int[path[1].src, [edge.dst for edge in path]...]
            shortest_paths = [normalized_path]
        else
            # Both ends are the same vertex: take the shortest path from each neighbour instead.
            neighbors = LightGraphs.neighbors(graph, opening_solid_kmer_index)
            shortest_paths = Vector{Vector{Int}}()
            for neighbor in neighbors
                path = LightGraphs.a_star(graph, neighbor, closing_solid_kmer_index)
                normalized_path = Int[path[1].src, [edge.dst for edge in path]...]
                push!(shortest_paths, normalized_path)
            end
        end
        # Walk length: the larger of the observed span and the longest shortest path,
        # scaled by 1.1 and rounded up.
        walk_length = Int(ceil(max(closing_solid_kmer_path_index - opening_solid_kmer_path_index + 1, maximum(length.(shortest_paths))) * 1.1))
        walks = [take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length)]
        # Keep sampling walks until one of them has been drawn twice.
        while allunique(walks)
            push!(walks, take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length))
        end
        @show walks
        # Only walks that pass through the closing k-mer can bridge the gap.
        walks = filter(path -> closing_solid_kmer in path, walks)
        @show walks
        if !isempty(walks)
            selected_path = rand(walks)
            # Cut the walk at the last occurrence of the closing k-mer.
            selected_path = selected_path[1:findlast(kmer -> kmer == closing_solid_kmer, selected_path)]
            @show selected_path
            # NOTE(review): as in the "missing closing" branch, proposed_path is replaced
            # by a slice of the observation rather than extended -- confirm for the case
            # of more than one gap.
            proposed_path = getproperty.(observation_kmers[length(proposed_path)+1:opening_solid_kmer_path_index], :fw)
            @show proposed_path
            append!(proposed_path, selected_path[2:end])
            @show proposed_path
            # Move on to the next gap after the closing k-mer, if any.
            opening_missing_kmer_path_index = findnext(map(x -> x == 0, kmer_index), closing_solid_kmer_path_index+1)
            @show opening_missing_kmer_path_index
            if opening_missing_kmer_path_index == nothing
                #done
                opening_solid_kmer = nothing
            elseif opening_missing_kmer_path_index == 1
                opening_solid_kmer = nothing
            elseif opening_missing_kmer_path_index != nothing
                opening_solid_kmer_path_index = opening_missing_kmer_path_index - 1
                opening_solid_kmer = observation_kmers[opening_solid_kmer_path_index].fw
                closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), opening_missing_kmer_path_index+1)
                if closing_solid_kmer_path_index == nothing
                    closing_solid_kmer = nothing
                else
                    closing_solid_kmer = observation_kmers[closing_solid_kmer_path_index].fw
                end
            end
        else
            # No sampled walk reached the closing k-mer: widen the gap and retry.
            @show "need to bump out"
            if opening_missing_kmer_path_index >= 2
                opening_missing_kmer_path_index -= 1
                opening_solid_kmer = nothing
                # this should now break out to larger while loop
            end
            # push out closing node and try again
            closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), closing_solid_kmer_path_index+1)

            if closing_solid_kmer_path_index == nothing
                @show "end is open"
                closing_solid_kmer = nothing
            else
                closing_solid_kmer = observation_kmers[closing_solid_kmer_path_index].fw
            end
        end
    end
end

iterations = 1
"bubble!" = "bubble!"
walks = Vector{BioSequences.BigDNAMer{3}}[[CTG, TGC, GCA, CAG, AGG, GGC], [CTG, TGC, GCA, CAA, AAA], [CTG, TGC, GCA, CAG, AGG, GGC]]
walks = Vector{BioSequences.BigDNAMer{3}}[[CTG, TGC, GCA, CAA, AAA]]
selected_path = BioSequences.BigDNAMer{3}[CTG, TGC, GCA, CAA]
proposed_path = BioSequences.BigDNAMer{3}[AGC, GCC, CCT, CTG]
proposed_path = BioSequences.BigDNAMer{3}[AGC, GCC, CCT, CTG, TGC, GCA, CAA]
opening_missing_kmer_path_index = nothing


In [170]:
# Append the observed k-mers that follow the last closing solid k-mer. If the index still
# holds its initial value 0 (no gap was found), the whole observation is appended.
# Nothing is appended when the index is `nothing`.
if closing_solid_kmer_path_index != nothing
    append!(proposed_path, getproperty.(observation_kmers[closing_solid_kmer_path_index+1:end], :fw))
end

8-element Vector{BioSequences.BigDNAMer{3}}:
 AGC
 GCC
 CCT
 CTG
 TGC
 GCA
 CAA
 AAA

In [171]:
proposed_seq = BioSequences.LongDNASeq(first(proposed_path))

3nt DNA Sequence:
AGC

In [172]:
# Extend the sequence by the final base of every later k-mer.
# (Assumes consecutive k-mers in proposed_path overlap by k-1 bases.)
for kmer in proposed_path[2:end]
    push!(proposed_seq, last(kmer))
end
proposed_seq

10nt DNA Sequence:
AGCCTGCAAA

In [173]:
new_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), proposed_seq, genome)

BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 0
  seq:  1 AGCCTGCAAA 10
          ||||||||||
  ref:  1 AGCCTGCAAA 10


In [174]:
reverse_new_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), BioSequences.reverse_complement(proposed_seq), genome)

BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 8
  seq:  0 --TTTGCAGGCT 10
              ||||    
  ref:  1 AGCCTGCA--AA 10


In [175]:
# Score the proposal with whichever strand aligns better to the genome; unlike the
# earlier strand selection, the observation itself is not flipped here.
if reverse_new_alignment.value < new_alignment.value
#     observation = BioSequences.reverse_complement!(observation)
    new_alignment = reverse_new_alignment
    @show "flipping"
end

In [176]:
@assert new_alignment.value <= alignment.value