In [None]:
# Identifiers used to name this notebook's working directory.
DATE = "2021-09-15"
TASK = "sequencing-saturation"
# Working directory "<home>/<DATE>-<TASK>"; mkpath creates it if it does not exist.
DIR = mkpath(string(homedir(), "/", DATE, "-", TASK))

In [None]:
import Pkg
pkgs = [
    "BioAlignments",
    "BioSequences",
    "Clustering",
    "CodecZlib",
#     "Colors",
#     "Combinatorics",
    "DataFrames",
    "DataStructures",
    "Dates",
#     "DelimitedFiles",
    "Distances",
#     "Distributions",
#     "EzXML",
    "FASTX",
#     "GFF3",
    "GLM",
    "GraphPlot",
#     "HTTP",
#     "Impute",
#     "JSON",
    "LightGraphs",
    "LsqFit",
#     "LSHFunctions",
#     "Measures",
    "MetaGraphs",
    "https://github.com/cjprybol/Mycelia.git",
#     "NumericIO",
#     "PlotlyJS",
#     "Plots",
    "Primes",
#     "Printf",
    "ProgressMeter",
    "Random",
    "Revise",
    "SparseArrays",
    "Statistics",
    "StatsBase",
    "StatsPlots",
#     "StringDistances",
    "uCSV",
#     "XLSX",
]

# Split the package list: entries that start with "https" or end with "git" are
# treated as unregistered packages (URLs); everything else is handled by name.
unregistered_packages = [pkg for pkg in pkgs if occursin(r"(^https|git$)", pkg)]
registered_packages = setdiff(pkgs, unregistered_packages)

# Run `import <name>` at top level for a package name given as a string.
import_by_name(name) = eval(Meta.parse("import $(name)"))

# Registered packages: try the import first; on any failure add and build the
# package, then retry the import.
for pkg in registered_packages
    try
        import_by_name(pkg)
    catch
        Pkg.add(pkg)
        Pkg.build(pkg)
        import_by_name(pkg)
    end
end

# Unregistered packages: the package name is the URL basename without ".git";
# on import failure the package is installed with Pkg.develop from its URL.
for pkg_url in unregistered_packages
    pkg_name = replace(basename(pkg_url), ".git" => "")
    try
        import_by_name(pkg_name)
    catch
        Pkg.develop(url=pkg_url)
        Pkg.build(pkg_name)
        import_by_name(pkg_name)
    end
end

In [None]:
"""
    kmer_index_and_orientation_to_kmer(graph, kmer_index, orientation)

Return the k-mer stored under `:kmer` for vertex `kmer_index` in `graph.vprops`.
The stored k-mer is returned unchanged when `orientation` is `true`; otherwise its
reverse complement is returned.
"""
function kmer_index_and_orientation_to_kmer(graph, kmer_index, orientation)
    stored_kmer = graph.vprops[kmer_index][:kmer]
    return orientation ? stored_kmer : BioSequences.reverse_complement(stored_kmer)
end

In [None]:
"""
    determine_edge_probabilities(graph)

Return a sparse `nv × nv` matrix whose row `v` holds the probability of stepping
from vertex `v` to each of its neighbors.

For every neighbor the weight is the product of
- the edge's share of the `:count` values over all edges incident to `v`, and
- the neighbor's share of the `:count` values over all neighbors of `v`,
and the weights of each row are then normalized to sum to 1.

Vertices without neighbors keep an all-zero row instead of aborting the whole
computation. Edge properties are read from `graph.eprops` using the
(smaller index, larger index) ordering of the endpoints.
"""
function determine_edge_probabilities(graph)
    nv = LightGraphs.nv(graph)
    edge_probabilities = SparseArrays.spzeros(nv, nv)
    for v in 1:nv
        neighbors = LightGraphs.neighbors(graph, v)
        # Nothing to normalize for an isolated vertex; dividing by an empty sum
        # would otherwise produce NaN/errors and trip the assertion below.
        isempty(neighbors) && continue

        edge_counts = map(neighbors) do neighbor
            # eprops is looked up with the endpoints in ascending order
            edge = LightGraphs.Edge(min(v, neighbor), max(v, neighbor))
            @assert LightGraphs.has_edge(graph, edge)
            graph.eprops[edge][:count]
        end
        vertex_counts = [graph.vprops[neighbor][:count] for neighbor in neighbors]

        weights = (edge_counts ./ sum(edge_counts)) .* (vertex_counts ./ sum(vertex_counts))
        weights = weights ./ sum(weights)
        # Compare with a tolerance: a sum of normalized floats can differ from 1
        # by more than one ulp purely through rounding.
        @assert isapprox(sum(weights), 1; atol=sqrt(eps(Float64)))

        for (neighbor, weight) in zip(neighbors, weights)
            edge_probabilities[v, neighbor] = weight
        end
    end

    return edge_probabilities
end

In [None]:
"""
    orient_path(graph, kmers, path, opening_orientation, opening_kmer, closing_orientation)

Assign an oriented k-mer to every vertex of `path`, starting from `opening_kmer`.

For each subsequent vertex, the neighbors of the previously chosen k-mer
(`BioSequences.neighbors`) are canonicalized and looked up in the sorted `kmers`
list; a neighbor is accepted when its index equals the vertex. If several neighbors
match, the last one wins. The orientation is `true` when the accepted neighbor is
already canonical.

Returns `(vertices, path_orientations, path_kmers)`, or `nothing` as soon as some
vertex cannot be reached from its predecessor. `graph` and `closing_orientation`
are accepted for interface compatibility but are not used. Progress is traced with
`@show`.
"""
function orient_path(graph, kmers, path, opening_orientation, opening_kmer, closing_orientation)
    vertices = path
    path_orientations = [opening_orientation]
    path_kmers = [opening_kmer]
    @show opening_orientation
    for (i, vertex) in enumerate(vertices[2:end])
        @show i, vertex, last(path_orientations)
        viable_neighbor = nothing
        viable_orientation = nothing
        for neighbor in BioSequences.neighbors(last(path_kmers))
            @show "considering going from $(last(path_kmers)) to $(neighbor)"
            canonical_neighbor = BioSequences.canonical(neighbor)
            canonical_neighbor_index_range = searchsorted(kmers, canonical_neighbor)
            # only an unambiguous (exactly one) match in `kmers` is considered
            if length(canonical_neighbor_index_range) == 1
                @show canonical_neighbor_index = first(canonical_neighbor_index_range)
                if canonical_neighbor_index == vertex
                    @show "hit, selecting $neighbor"
                    viable_neighbor = neighbor
                    viable_orientation = neighbor == canonical_neighbor
                end
            end
        end
        if isnothing(viable_neighbor) || isnothing(viable_orientation)
            # not a viable path, need to look towards the next miss
            return nothing
        end
        push!(path_kmers, viable_neighbor)
        push!(path_orientations, viable_orientation)
    end
    return vertices, path_orientations, path_kmers
end

In [None]:
"""
    take_a_walk(graph, edge_likelihoods, kmers, kmer, walk_length)

Perform a random walk of up to `walk_length` oriented k-mers starting at `kmer`.

At every step the neighbors of the current k-mer (`BioSequences.neighbors`) are
canonicalized and looked up in the sorted `kmers` list; those connected to the
current vertex by an edge of `graph` are candidates. A single candidate is taken
directly, several candidates are sampled with weights read from
`edge_likelihoods[current_vertex, candidate_vertices]`. The walk stops early and
is returned as-is when no candidate exists.

Returns the vector of visited k-mers, including the starting k-mer.
"""
function take_a_walk(graph, edge_likelihoods, kmers, kmer, walk_length)
    # Map an oriented k-mer to its vertex index; it must occur exactly once in `kmers`.
    function vertex_of(oriented_kmer)
        current_kmer_indices = searchsorted(kmers, BioSequences.canonical(oriented_kmer))
        @assert length(current_kmer_indices) == 1
        return first(current_kmer_indices)
    end

    walk = [kmer]
    current_kmer = kmer
    current_kmer_index = vertex_of(last(walk))
    while length(walk) < walk_length
        # collect (vertex index, is_canonical) for every neighbor reachable by an edge
        viable_neighbors = Tuple{Int, Bool}[]
        for neighbor in BioSequences.neighbors(current_kmer)
            canonical_neighbor = BioSequences.canonical(neighbor)
            hits = searchsorted(kmers, canonical_neighbor)
            isempty(hits) && continue
            neighbor_vertex = first(hits)
            if LightGraphs.has_edge(graph, current_kmer_index, neighbor_vertex)
                push!(viable_neighbors, (neighbor_vertex, neighbor == canonical_neighbor))
            end
        end

        # dead end: return what has been walked so far
        isempty(viable_neighbors) && return walk

        chosen_neighbor = if length(viable_neighbors) == 1
            first(viable_neighbors)
        else
            candidate_vertices = first.(viable_neighbors)
            step_weights = StatsBase.weights(edge_likelihoods[current_kmer_index, candidate_vertices])
            StatsBase.sample(viable_neighbors, step_weights)
        end

        current_kmer = kmer_index_and_orientation_to_kmer(graph, chosen_neighbor...)
        push!(walk, current_kmer)
        current_kmer_index = vertex_of(last(walk))
    end
    return walk
end

In [None]:
"""
    sort_fastq(input_fastq, output_fastq)

Sort the reads of a FASTQ file by sequence length, longest to shortest, and write
the result to `output_fastq`.

Each 4-line FASTQ entry is turned into a single tab separated line, a column with
the length of the read is added, the lines are passed to Unix `sort -nr`, the
length column is removed, and the lines are converted back into FASTQ records.
When `input_fastq` ends with `.gz` it is read through `gzip -dc` and the output is
compressed with `gzip`; otherwise input and output are plain text.

The external programs `gzip`, `cat`, `paste`, `perl`, `sort`, `cut` and `tr` are
invoked, so they must be available.

Returns `output_fastq`. Throws an `ArgumentError` when `output_fastq` is empty
(the default), because there is no file to redirect the sorted reads to.

http://thegenomefactory.blogspot.com/2012/11/sorting-fastq-files-by-sequence-length.html
"""
function sort_fastq(input_fastq, output_fastq="")
    if isempty(output_fastq)
        throw(ArgumentError("output_fastq must be a non-empty file path"))
    end

    is_gzipped = endswith(input_fastq, ".gz")

    # The two variants differ only in how the input is read and whether the
    # output is re-compressed.
    stages = Cmd[
        is_gzipped ? `gzip -dc $input_fastq` : `cat $input_fastq`,
        `paste - - - -`,
        `perl -ne '@x=split m/\t/; unshift @x, length($x[1]); print join "\t",@x;'`,
        `sort -nr`,
        `cut -f2-`,
        `tr "\t" "\n"`,
    ]
    if is_gzipped
        push!(stages, `gzip`)
    end

    run(pipeline(pipeline(stages...), output_fastq))
    return output_fastq
end

In [None]:
"""
    count_reads(fastq_file)

Count the records yielded by `Mycelia.open_fastx(fastq_file)`, print the count
with `@show`, and return it.
"""
function count_reads(fastq_file)
    @info "determing # of reads in fastq file"
    # every record counts; the predicate ignores the record itself
    n_reads = count(_ -> true, Mycelia.open_fastx(fastq_file))
    @show n_reads
    return n_reads
end

In [None]:
"""
    determine_read_lengths(fastq_file; total_reads = Inf)

Return a vector with the sequence length of every record in `fastq_file`, in file
order.

`total_reads` sizes the result vector and the progress meter; when left at `Inf`
it is determined by calling `count_reads(fastq_file)` first.
"""
function determine_read_lengths(fastq_file; total_reads = Inf)
    n_expected = total_reads == Inf ? count_reads(fastq_file) : total_reads
    read_lengths = zeros(Int, n_expected)
    @info "determining read lengths"
    progress = ProgressMeter.Progress(n_expected, 1)
    for (record_number, record) in enumerate(Mycelia.open_fastx(fastq_file))
        read_lengths[record_number] = length(FASTX.sequence(record))
        ProgressMeter.next!(progress)
    end
    return read_lengths
end

In [None]:
"""
    max_canonical_kmers(kmer_type)

Return half of `4^k`, where `k` is the last type parameter of `kmer_type`.
Only canonical k-mers are considered, hence the division by two.
"""
function max_canonical_kmers(kmer_type)
    k = last(kmer_type.parameters)
    total_kmers = 4^k
    # we only consider canonical kmers so cut in 1/2
    return Int(total_kmers / 2)
end

In [None]:
# Keep the sampling points strictly below `kmers_assessed` (all of which have been
# recorded), then append the final observation as the last point of the curve.
function _truncate_saturation_curve(sampling_points, unique_kmer_counts, kmers_assessed, unique_kmers_count)
    n_kept = searchsortedfirst(sampling_points, kmers_assessed) - 1
    return (
        sampling_points = vcat(sampling_points[1:n_kept], kmers_assessed),
        unique_kmer_counts = vcat(unique_kmer_counts[1:n_kept], unique_kmers_count),
    )
end

"""
    assess_kmer_saturation(fastq_file, kmer_type; kmers_to_assess=Inf, power=10)

Stream the k-mers of `fastq_file` and record how many distinct canonical k-mers
have been seen after 0, 1, `power`, `power^2`, ... k-mers.

# Keywords
- `kmers_to_assess`: stop after this many k-mers; `Inf` (default) means
  `max_canonical_kmers(kmer_type)`.
- `power`: base of the geometric series of sampling points; must be greater than 1.

# Returns
A named tuple `(; sampling_points, unique_kmer_counts)` of equal-length vectors.
Scanning stops when `kmers_to_assess` k-mers have been seen, when every possible
canonical k-mer has been observed, or when the input is exhausted. In all three
cases the curve ends with the point `(k-mers assessed, distinct k-mers)` and
contains no sampling point that was never reached. If fewer than three sampling
points fit below `kmers_to_assess`, nothing is read and all counts are zero.

# Throws
- `ArgumentError` if `power <= 1` (the sampling points would never grow).
"""
function assess_kmer_saturation(fastq_file, kmer_type; kmers_to_assess=Inf, power=10)
    power > 1 || throw(ArgumentError("power must be greater than 1, got $(power)"))

    canonical_kmers = Set{kmer_type}()

    max_possible_kmers = max_canonical_kmers(kmer_type)

    if kmers_to_assess == Inf
        kmers_to_assess = max_possible_kmers
    end

    sampling_points = Int[0]
    i = 0
    while power^i <= kmers_to_assess
        push!(sampling_points, power^i)
        i += 1
    end

    unique_kmer_counts = zeros(Int, length(sampling_points))

    if length(sampling_points) < 3
        @info "increase the # of reads analyzed or decrease the power to acquire more data points"
        return (;sampling_points, unique_kmer_counts)
    end

    p = ProgressMeter.Progress(kmers_to_assess, 1)

    # Index of the next sampling point to record. Comparing integers directly
    # avoids the rounding errors of testing whether log(power, n) is whole.
    next_sample = 1
    kmers_assessed = 0
    for record in Mycelia.open_fastx(fastq_file)
        for kmer in BioSequences.each(kmer_type, FASTX.sequence(record))
            unique_kmers_count = length(canonical_kmers)
            if (kmers_assessed == kmers_to_assess) || (unique_kmers_count == max_possible_kmers)
                return _truncate_saturation_curve(sampling_points, unique_kmer_counts, kmers_assessed, unique_kmers_count)
            elseif (next_sample <= length(sampling_points)) && (kmers_assessed == sampling_points[next_sample])
                unique_kmer_counts[next_sample] = unique_kmers_count
                next_sample += 1
            end
            canonical_kmer = kmer.fw < kmer.bw ? kmer.fw : kmer.bw
            push!(canonical_kmers, canonical_kmer)
            kmers_assessed += 1
            ProgressMeter.next!(p)
        end
    end
    # Input exhausted before reaching `kmers_to_assess`: drop the sampling points
    # that were never reached instead of reporting them with a count of zero.
    return _truncate_saturation_curve(sampling_points, unique_kmer_counts, kmers_assessed, length(canonical_kmers))
end

In [None]:
"""
    calculate_v(s, p)

Evaluate the Michaelis–Menten curve `v = (vmax * s) / (km + s)` element-wise over
`s`, with `vmax = p[1]` and `km = p[2]`.
"""
function calculate_v(s, p)
    vmax, km = p[1], p[2]
    return @. (vmax * s) / (km + s)
end

In [None]:
# read in reads from pacbio run

In [None]:
# Path of the gzipped FASTQ file analyzed by this notebook.
# fastq_directory = "$(homedir())/pacbio-test-data"
# fastq_file = first(readdir(fastq_directory, join=true))
fastq_file = "$(homedir())/pacbio-test-data/b002225.subreads.fastq.gz"

In [None]:
# sort fastq file by read length, longest reads first (sort_fastq uses `sort -nr`)

In [None]:
# Name of the length-sorted copy; the sort itself is commented out because the
# sorted file is reused between runs.
sorted_fastq_file = replace(fastq_file, r"\.fastq\.gz$" => ".sorted.fastq.gz")
# @time sort_fastq(fastq_file, sorted_fastq_file)
# 62 minutes!

In [None]:
# Read count hard-coded in place of the commented-out count_reads call.
# NOTE(review): must match the number of records in sorted_fastq_file,
# because it sizes the read_lengths vector below.
# @time total_reads = count_reads(sorted_fastq_file)
total_reads = 1970799

In [None]:
# Sequence length of every read, in file order.
@time read_lengths = determine_read_lengths(sorted_fastq_file, total_reads = total_reads)

In [None]:
# Base of the geometric series of sampling points passed to assess_kmer_saturation.
power = 2

In [None]:
# Scan prime k values from 3 to 31. For each k, measure how the number of distinct
# canonical k-mers grows with the number of k-mers read, fit a Michaelis–Menten
# curve to it, and plot the result. The k with the lowest predicted saturation
# (and its fitted midpoint) is remembered; the scan stops early once the predicted
# saturation drops below 0.1.
# NOTE(review): the loop assigns to minimum_saturation, minimum_k and midpoint
# defined just above; this assumes notebook/REPL scoping where such assignments
# update the globals.
ks = Primes.primes(3, 31)
minimum_k = first(ks)
minimum_saturation = Inf
midpoint = Inf
for k in ks
    kmer_type = BioSequences.DNAMer{k}
    kmers_per_read = read_lengths .- k .+ 1
    total_kmers = sum(kmers_per_read)
    # assess at most 10M k-mers, the number of possible canonical k-mers,
    # or 1% of all k-mers in the reads, whichever is smallest
    kmers_to_assess = min(
        10_000_000,
        max_canonical_kmers(kmer_type),
        ceil(Int, total_kmers * .01),
    )
    sampling_points, kmer_counts = assess_kmer_saturation(
        sorted_fastq_file,
        kmer_type,
        kmers_to_assess=kmers_to_assess,
        power=power,
    )

    # initial guess for the fit: plateau = largest observed count,
    # midpoint = first sampling point above half of the final count
    observed_midpoint_index = findfirst(>(last(kmer_counts)/2), kmer_counts)
    observed_midpoint = sampling_points[observed_midpoint_index]
    initial_parameters = Float64[maximum(kmer_counts), observed_midpoint]
    @time fit = LsqFit.curve_fit(calculate_v, sampling_points, kmer_counts, initial_parameters)
    inferred_kmer_count = max(ceil(Int, fit.param[1]), last(kmer_counts))
    inferred_midpoint = ceil(Int, fit.param[2])
    predicted_saturation = inferred_kmer_count / max_canonical_kmers(kmer_type)
    @show k, predicted_saturation

    p = StatsPlots.scatter(
        sampling_points,
        kmer_counts,
        label="observed kmer counts",
        ylabel="# unique kmers",
        xlabel="# kmers assessed",
        title = "sequencing saturation @ k = $k",
        legend=:outertopright,
        size=(800, 400)
        )
    StatsPlots.hline!(p, [max_canonical_kmers(kmer_type)], label="absolute maximum")
    StatsPlots.hline!(p, [inferred_kmer_count], label="inferred maximum")
    StatsPlots.vline!(p, [inferred_midpoint], label="inferred midpoint")
    # evaluate the fitted curve at the sampling points plus the inferred midpoint
    xs = sort(vcat(sampling_points, inferred_midpoint))
    ys = calculate_v(xs, fit.param)
    StatsPlots.plot!(p, xs, ys, label="fit trendline")
    display(p)

    if predicted_saturation < minimum_saturation
        minimum_saturation = predicted_saturation
        minimum_k = k
        midpoint = inferred_midpoint
    end
    predicted_saturation < 0.1 && break
end

In [None]:
# Fitted midpoint (in k-mers) for the k with the lowest predicted saturation.
midpoint

In [None]:
# Continue with the k that had the lowest predicted saturation in the scan above.
k = minimum_k

In [None]:
# Number of k-mers contributed by each read at the selected k.
kmer_type = BioSequences.DNAMer{k}
kmers_per_read = read_lengths .- k .+ 1
total_kmers = sum(kmers_per_read)

In [None]:
# Find the first read at which the cumulative k-mer count exceeds `midpoint`;
# that read number becomes `reads_to_write`.
# NOTE(review): reads_to_write stays 0 if the cumulative count never exceeds
# midpoint, in which case the subset written later would be empty.
n_kmers = 0
reads_to_write = 0
for (i, kmers_in_read) in enumerate(kmers_per_read)
    n_kmers += kmers_in_read
#     if n_kmers > (midpoint * 2)
    if n_kmers > (midpoint)
        @show reads_to_write = i
        break
    end
end

In [None]:
# downsample reads to desired depth

In [None]:
# Output path for the first `reads_to_write` reads of the sorted file.
subset_fastq_file = replace(sorted_fastq_file, ".sorted.fastq.gz" => ".sorted.subset.fastq.gz")
# NOTE(review): the two commented lines below refer to the sorting step, not to
# subsetting; they look like leftovers copied from the sorting cell.
# @time sort_fastq(fastq_file, sorted_fastq_file)
# 62 minutes!

In [None]:
# Write the first `reads_to_write` records of the sorted FASTQ file to a new
# gzip-compressed FASTQ file.
io = CodecZlib.GzipCompressorStream(open(subset_fastq_file, "w"))
fastx_io = FASTX.FASTQ.Writer(io)
try
    # Iterators.take stops after `reads_to_write` records without reading further
    for record in Iterators.take(Mycelia.open_fastx(sorted_fastq_file), reads_to_write)
        write(fastx_io, record)
    end
finally
    # always close the writer and the compressor stream, even if a write fails,
    # so the output file handle is not leaked
    close(fastx_io)
    close(io)
end

In [None]:
# Read lengths of the subset file (the read count is determined by count_reads
# because total_reads is not passed).
determine_read_lengths(subset_fastq_file)

In [None]:
# plot kmer spectra frequency

In [None]:
# Canonical k-mer counts of `genome`, sorted.
# NOTE(review): `genome` is not defined anywhere in the visible part of this
# notebook; it must be defined before this cell is run.
kmer_counts = sort!(Mycelia.count_canonical_kmers(BioSequences.BigDNAMer{k}, genome))

In [None]:
# Number of distinct canonical k-mers, i.e. the number of graph vertices.
K = length(keys(kmer_counts))

In [None]:
# create an undirected kmer graph from the sequence

In [None]:
graph = MetaGraphs.MetaGraph(K)

In [None]:
# Vertex i carries the i-th k-mer (in the iteration order of kmer_counts) and its count.
for (i, (kmer, count)) in enumerate(kmer_counts)
    @show i, kmer, count
    @show MetaGraphs.set_prop!(graph, i, :kmer, kmer)
    @show MetaGraphs.set_prop!(graph, i, :count, count)
end

In [None]:
graph.vprops

In [None]:
# K-mers in the same order as the graph vertices; the searchsorted lookups used
# elsewhere in this notebook rely on this list being sorted.
kmers = collect(keys(kmer_counts))

In [None]:
# Add one undirected edge for every pair of consecutive k-mers in `genome`
# (each window of k+1 bases), counting how often each edge is observed.
for i in 1:length(genome)-k
    edge_range = i:i+k
    edge = genome[edge_range]
    # the first and last k bases of the (k+1)-base window are the two k-mers
    src = BioSequences.BigDNAMer{k}(edge[1:end-1])
    dst = BioSequences.BigDNAMer{k}(edge[2:end])
    canonical_src = BioSequences.canonical(src)
    canonical_dst = BioSequences.canonical(dst)
    src_index = Mycelia.get_kmer_index(kmers, canonical_src)
    dst_index = Mycelia.get_kmer_index(kmers, canonical_dst)
    @show edge
    @show src_index, src == canonical_src, dst_index, dst == canonical_dst
    # Order the endpoints (smaller index first) before touching graph.eprops
    # directly: determine_edge_probabilities reads edge properties under that
    # same ordering, and an unordered key would miss the entry of an edge that
    # was first added in the opposite direction.
    graph_edge = LightGraphs.Edge(min(src_index, dst_index), max(src_index, dst_index))
    if LightGraphs.has_edge(graph, graph_edge)
        current_count = graph.eprops[graph_edge][:count]
        MetaGraphs.set_prop!(graph, graph_edge, :count, current_count+1)
    else
        LightGraphs.add_edge!(graph, graph_edge)
        MetaGraphs.set_prop!(graph, graph_edge, :count, 1)
    end
end

In [None]:
graph

In [None]:
graph.eprops

In [None]:
# Draw the graph with vertex sizes taken from the k-mer counts.
# nodelabel is computed but only used by the commented-out plotting call.
nodesize = [graph.vprops[v][:count] for v in 1:LightGraphs.nv(graph)]
nodelabel = [graph.vprops[v][:kmer] for v in 1:LightGraphs.nv(graph)]
# GraphPlot.gplot(graph, nodesize=nodesize, nodelabel=nodelabel)
GraphPlot.gplot(graph, nodesize=nodesize)

In [None]:
# Sparse matrix of step probabilities used by take_a_walk.
edge_likelihoods = determine_edge_probabilities(graph)

In [None]:
# take yen_k shortest paths

In [None]:
# K-mers of the genome in sequence order.
genome_kmers = collect(BioSequences.each(BioSequences.BigDNAMer{k}, genome))

In [None]:
# make a SNP edit to the genome

In [None]:
# Seed for the simulated observation; earlier values are kept for reference.
# seed = 0
# seed = 1
# seed = 2
# seed = 3
# seed = 4
# seed = 5
# seed = 6
# seed = 7
# seed = 8
seed = 9

In [None]:
# Simulate an observation of the genome with Mycelia.observe at error_rate 0.1,
# seeded for reproducibility.
@show seed
Random.seed!(seed)
observation = Mycelia.observe(genome, error_rate = 0.1)

In [None]:
# Levenshtein alignment of the observation against the genome ...
alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), observation, genome)

In [None]:
# ... and of its reverse complement.
reverse_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), BioSequences.reverse_complement(observation), genome)

In [None]:
# Keep whichever strand of the observation aligns with the smaller distance.
if reverse_alignment.value < alignment.value
    observation = BioSequences.reverse_complement(observation)
    alignment = reverse_alignment
    @show "flipping"
end

In [None]:
# convert genome into stranded path

In [None]:
# function sequence_to_
# NOTE(review): observation_as_oriented_kmers is not used in the visible part
# of this notebook.
observation_as_oriented_kmers = []
observation_kmers = collect(BioSequences.each(BioSequences.BigDNAMer{k}, observation))

In [None]:
genome_kmers

In [None]:
# For every k-mer of the observation record whether its forward form is canonical
# and the index of its canonical form in `kmers`; index 0 means "not found".
is_canonical = falses(length(observation_kmers))
kmer_index = zeros(Int, length(observation_kmers))
for (i, kmer) in enumerate(observation_kmers)
#     is_canonical[i] = kmer.fw <= kmer.bw
    canonical_kmer = BioSequences.canonical(kmer.fw)
    is_canonical[i] = kmer.fw == canonical_kmer
    kmer_index_range = searchsorted(kmers, canonical_kmer)
    if length(kmer_index_range) > 1
        # more than one match is only logged; the index stays 0
        @error "bad"
    elseif isempty(kmer_index_range)
        # do nothing, index is 0 to indicate not found
    else
        kmer_index[i] = first(kmer_index_range)
    end
end

In [None]:
is_canonical

In [None]:
kmer_index

In [None]:
# Build `proposed_path`, a corrected k-mer path for the observation.
# `kmer_index[i] == 0` marks observation k-mers that are not in the graph
# ("missing"); non-zero entries are "solid". Three cases are handled:
#   1. no missing k-mer   -> the observed k-mers are used unchanged
#   2. every k-mer missing -> the path is replaced by a random walk of similar length
#   3. otherwise           -> each run of missing k-mers is replaced by a walk
#                             through the graph between the flanking solid k-mers
proposed_path = BioSequences.BigDNAMer{k}[]
opening_missing_kmer_path_index = findfirst(x -> x == 0, kmer_index)

# read is solid!
if opening_missing_kmer_path_index == nothing
#     proposed_seq = observation
    proposed_path = [kmer.fw for kmer in observation_kmers]
    @show "perfect read"
elseif all(index -> index == 0, kmer_index)
    @show "read is trash"
    # generate a new read from the assembly to replace this one

    observation_path_length = length(observation_kmers)
    proposed_path = nothing
    proposed_path_size_discrepancy = Inf
    # Try up to 10 distinct starting vertices (sampled without replacement) and
    # keep the walk whose length is closest to the observed path length.
    starting_kmer_indices = StatsBase.sample(1:LightGraphs.nv(graph), min(10, LightGraphs.nv(graph)), replace=false)
    for starting_kmer_index in starting_kmer_indices
        # the sampled vertex is used as-is so that every attempt starts from a
        # different vertex
        @show starting_kmer_index
        starting_kmer = graph.vprops[starting_kmer_index][:kmer]
        # pick the strand at random
        if rand(Bool)
            starting_kmer = BioSequences.reverse_complement(starting_kmer)
        end
        candidate_path = take_a_walk(graph, edge_likelihoods, kmers, starting_kmer, observation_path_length)
        discrepancy = abs(length(candidate_path) - observation_path_length)
        @show discrepancy
        if discrepancy < proposed_path_size_discrepancy
            proposed_path = candidate_path
            proposed_path_size_discrepancy = discrepancy
            if proposed_path_size_discrepancy == 0
                break
            end
        end
    end
else
    # Locate the solid k-mer just before the first missing k-mer (if any) ...
    if opening_missing_kmer_path_index == 1
        opening_solid_kmer = nothing
    #     closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), opening_missing_kmer_path_index+1)
    elseif opening_missing_kmer_path_index != nothing
        opening_solid_kmer_path_index = opening_missing_kmer_path_index - 1
        opening_solid_kmer = observation_kmers[opening_solid_kmer_path_index].fw
    end

    # ... and the first solid k-mer after it.
    # NOTE(review): opening_missing_kmer_path_index cannot be `nothing` in this
    # branch (that case is handled by the first `if` above), so the first arm
    # below looks unreachable.
    if opening_missing_kmer_path_index == nothing
        closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), 1)
    else
        closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), opening_missing_kmer_path_index+1)
    end
    if closing_solid_kmer_path_index == nothing
        @show "end is open"
        closing_solid_kmer = nothing
    else
        closing_solid_kmer = observation_kmers[closing_solid_kmer_path_index].fw
    end


    # Repair gaps one at a time; give up after 5 rounds.
    extensions = 0
    while (opening_missing_kmer_path_index != nothing) && (extensions < 5)
        extensions += 1
        @show extensions

        # Gap at the start of the read: walk backwards from the closing solid
        # k-mer by walking forward from its reverse complement, then flip the walk.
        if (opening_solid_kmer == nothing) && (closing_solid_kmer != nothing)
            @show "missing opening"
            @show closing_solid_kmer
            inverted_closing_solid_kmer = BioSequences.reverse_complement(closing_solid_kmer)
            @show inverted_closing_solid_kmer
            walk_length = closing_solid_kmer_path_index
            chosen_walk = take_a_walk(graph, edge_likelihoods, kmers, inverted_closing_solid_kmer, walk_length)
            chosen_walk = reverse(BioSequences.reverse_complement.(chosen_walk))
            @show chosen_walk
            @show closing_solid_kmer_path_index
            @assert observation_kmers[closing_solid_kmer_path_index].fw == last(chosen_walk)
            # the walk replaces everything proposed so far
            proposed_path = chosen_walk
            opening_missing_kmer_path_index = findnext(map(x -> x == 0, kmer_index), closing_solid_kmer_path_index+1)

            if opening_missing_kmer_path_index == 1
                opening_solid_kmer = nothing
            elseif opening_missing_kmer_path_index != nothing
                # copy the solid stretch up to the next gap, then set up that gap
                need_to_add = getproperty.(observation_kmers[closing_solid_kmer_path_index+1:opening_missing_kmer_path_index-1], :fw)
                append!(proposed_path, need_to_add)
                opening_solid_kmer_path_index = opening_missing_kmer_path_index - 1
                opening_solid_kmer = observation_kmers[opening_solid_kmer_path_index].fw
                closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), opening_missing_kmer_path_index+1)
                if closing_solid_kmer_path_index == nothing
                    closing_solid_kmer = nothing
                else
                    closing_solid_kmer = observation_kmers[closing_solid_kmer_path_index].fw
                end
            end
        end

        # Gap at the end of the read: walk forward from the opening solid k-mer
        # for the remaining length of the observation.
        if (opening_solid_kmer != nothing) && (closing_solid_kmer == nothing)
            @show "missing closing"
            walk_length = length(observation_kmers) - opening_solid_kmer_path_index+1
            chosen_walk = take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length)
            @show chosen_walk
            @assert observation_kmers[opening_solid_kmer_path_index].fw == first(chosen_walk)
            if isempty(proposed_path)
                proposed_path = getproperty.(observation_kmers[1:opening_solid_kmer_path_index], :fw)
            end
            # the walk starts at the opening solid k-mer, which is already in the path
            append!(proposed_path, chosen_walk[2:end])
            opening_missing_kmer_path_index = nothing
        end

        # Gap with solid k-mers on both sides ("bubble"): sample walks from the
        # opening k-mer and keep one that reaches the closing k-mer.
        while (opening_solid_kmer != nothing) && (closing_solid_kmer != nothing)
            @show "bubble!"
            opening_solid_kmer_index = kmer_index[opening_solid_kmer_path_index]
            closing_solid_kmer_index = kmer_index[closing_solid_kmer_path_index]

            @show opening_solid_kmer, opening_solid_kmer_path_index
            @show closing_solid_kmer, closing_solid_kmer_path_index
            @show length(proposed_path)

            #####################################################################################
            # DETERMINE PATH LENGTH
            #####################################################################################
            # by default, we're assuming that we want to walk the same distance as our bubble + 1
            bubble_path_length = closing_solid_kmer_path_index - opening_solid_kmer_path_index + 1

            if opening_solid_kmer_index != closing_solid_kmer_index
                path = LightGraphs.a_star(graph, opening_solid_kmer_index, closing_solid_kmer_index)
                normalized_path = Int[path[1].src, [edge.dst for edge in path]...]
                shortest_paths = [normalized_path]
            else
                # opening and closing are the same vertex: use the shortest paths
                # from each neighbor back to that vertex instead
                neighbors = LightGraphs.neighbors(graph, opening_solid_kmer_index)
                shortest_paths = Vector{Vector{Int}}()
                for neighbor in neighbors
                    path = LightGraphs.a_star(graph, neighbor, closing_solid_kmer_index)
                    normalized_path = Int[path[1].src, [edge.dst for edge in path]...]
                    push!(shortest_paths, normalized_path)
                end
            end
            # but if our bubble is a deletion, then walking that length may not get us across to the other side
            # so we also determine the longest, shortest path
            longest_shortest_walk_length = maximum(length.(shortest_paths))


            # walk 10% further than the longer of the two estimates
            longest_walk_length = max(bubble_path_length, longest_shortest_walk_length)
            walk_length = Int(ceil(longest_walk_length * 1.1))

            walks = Vector{Int}[]
            # up to 3 rounds, each with a 10% longer walk than the previous one
            for i in 1:3
                println("round $i")
                walks = [take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length)]
                # keep sampling walks until one of them repeats
                # NOTE(review): this loop only ends once a duplicate walk is drawn
                while allunique(walks)
                    push!(walks, take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length))
                end

                # only walks that pass through the closing solid k-mer are usable
                walks = filter(path -> closing_solid_kmer in path, walks)

                if isempty(walks)
                    walk_length = Int(ceil(walk_length * 1.1))
                else
                    # done!
                    break
                end
            end

            if !isempty(walks)
                # take the most frequently sampled walk
                candidate_paths = sort(collect(StatsBase.countmap(walks)), by=x->x[2], rev=true)
                selected_path = first(first(candidate_paths))
                selected_path_closing_solid_kmer_intersects = findall(kmer -> kmer == closing_solid_kmer, selected_path)
                if length(selected_path_closing_solid_kmer_intersects) > 1
                    @show "here"
                    # the closing k-mer occurs several times: cut at the
                    # occurrence whose position is closest to walk_length
                    selected_path_closing_solid_kmer_intersect = 0
                    d = Inf
                    for i in selected_path_closing_solid_kmer_intersects
                        candidate_distance = abs(i - walk_length)
                        if candidate_distance < d
                            d = candidate_distance
                            selected_path_closing_solid_kmer_intersect = i
                        end
                    end
                else
                    selected_path_closing_solid_kmer_intersect = first(selected_path_closing_solid_kmer_intersects)
                end
                # truncate the walk at the closing solid k-mer
                selected_path = selected_path[1:selected_path_closing_solid_kmer_intersect]
                if isempty(proposed_path)
                    proposed_path = getproperty.(observation_kmers[1:opening_solid_kmer_path_index], :fw)
                end
                append!(proposed_path, selected_path[2:end])
                # move on to the next gap after the closing solid k-mer, if any
                opening_missing_kmer_path_index = findnext(map(x -> x == 0, kmer_index), closing_solid_kmer_path_index+1)
                if opening_missing_kmer_path_index == nothing
                    #done
                    opening_solid_kmer = nothing
                elseif opening_missing_kmer_path_index == 1
                    # also done?
                    opening_solid_kmer = nothing
                elseif opening_missing_kmer_path_index != nothing
                    need_to_add = getproperty.(observation_kmers[closing_solid_kmer_path_index+1:opening_missing_kmer_path_index-1], :fw)
                    append!(proposed_path, need_to_add)
                    opening_solid_kmer_path_index = opening_missing_kmer_path_index - 1
                    opening_solid_kmer = observation_kmers[opening_solid_kmer_path_index].fw
                    closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), opening_missing_kmer_path_index+1)
                    if closing_solid_kmer_path_index == nothing
                        closing_solid_kmer = nothing
                    else
                        closing_solid_kmer = observation_kmers[closing_solid_kmer_path_index].fw
                    end
                end
            else
                # no sampled walk reached the closing k-mer: widen the gap
                @show "need to bump out"
                if opening_missing_kmer_path_index >= 2
                    opening_missing_kmer_path_index -= 1
                    opening_solid_kmer = nothing
                    # this should now break out to larger while loop
                end
                # push out closing node and try again
                closing_solid_kmer_path_index = findnext(map(x -> x != 0, kmer_index), closing_solid_kmer_path_index+1)

                if closing_solid_kmer_path_index == nothing
                    @show "end is open"
                    closing_solid_kmer = nothing
                else
                    closing_solid_kmer = observation_kmers[closing_solid_kmer_path_index].fw
                end
            end
        end
    end
    # copy whatever follows the last closing solid k-mer unchanged
    if closing_solid_kmer_path_index != nothing
        append!(proposed_path, getproperty.(observation_kmers[closing_solid_kmer_path_index+1:end], :fw))
    end
end

In [None]:
proposed_path

In [None]:
# Rebuild a sequence from the k-mer path: the first k-mer in full, then the last
# base of every following k-mer.
# NOTE(review): assumes proposed_path is a non-empty vector of k-mers.
proposed_seq = BioSequences.LongDNASeq(first(proposed_path))
for kmer in proposed_path[2:end]
    push!(proposed_seq, last(kmer))
end
proposed_seq

In [None]:
# Levenshtein alignment of the proposed sequence against the genome ...
new_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), proposed_seq, genome)

In [None]:
# ... and of its reverse complement.
reverse_new_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), BioSequences.reverse_complement(proposed_seq), genome)

In [None]:
# Keep the better of the two strand alignments (only the alignment is replaced;
# proposed_seq itself is not flipped).
if reverse_new_alignment.value < new_alignment.value
#     observation = BioSequences.reverse_complement!(observation)
    new_alignment = reverse_new_alignment
    @show "flipping"
end

In [None]:
# Compare the distance of the proposed sequence with that of the original observation.
if new_alignment.value <= alignment.value
    @show "newer is equivalent or better"
    @show new_alignment.value, alignment.value
else
    @show "newer is worse, keep original"
    @show new_alignment.value, alignment.value
end