In [None]:
# if hit plotting library issues, try resetting LD path for julia
# can set in ~/.local/share/jupyter/kernels/
# @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg
pkgs = [
    "Revise",
    "FASTX",
    "BioSequences",
    "Kmers",
    "Graphs",
    "MetaGraphs",
    "SparseArrays",
    "ProgressMeter",
    "Distributions",
    "HiddenMarkovModels",
    "BioAlignments",
    "StatsBase"
]
# Pkg.add(pkgs)
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
"""
    find_contiguous_stretches_as_ranges(indices)

Group integer `indices` into maximal runs of consecutive values.

A sorted copy of `indices` is scanned once (the argument itself is not mutated) and
every run of values that increase by exactly one is emitted as a `UnitRange{Int64}`,
in ascending order.

# Returns
A `Vector{UnitRange{Int64}}`. It is empty when `indices` is empty. Repeated values are
absorbed into the run that already contains them, so the ranges never overlap.
"""
function find_contiguous_stretches_as_ranges(indices)
    contiguous_stretches = UnitRange{Int64}[]

    # Nothing to group: return early instead of indexing into an empty collection.
    isempty(indices) && return contiguous_stretches

    sorted_indices = sort(indices)
    stretch_start = first(sorted_indices)
    stretch_end = stretch_start

    for index in Iterators.drop(sorted_indices, 1)
        if index == stretch_end
            # Duplicate of the value that currently ends the stretch; nothing new to record.
            continue
        elseif index == stretch_end + 1
            # Consecutive value: extend the current stretch.
            stretch_end = index
        else
            # Gap found: close the current stretch and open a new one at `index`.
            push!(contiguous_stretches, stretch_start:stretch_end)
            stretch_start = index
            stretch_end = index
        end
    end

    # The stretch still open when the scan ends has not been recorded yet.
    push!(contiguous_stretches, stretch_start:stretch_end)

    return contiguous_stretches
end

In [None]:
"""
    emission_likelihood(; error_rate, accuracy, observed_sequence, alternative_sequence, fast=false)

Score `observed_sequence` against `alternative_sequence` as a product of per-position
factors, starting from `1.0`.

# Keywords
- `error_rate`: factor applied for every disagreement.
- `accuracy`: factor applied for every agreement.
- `observed_sequence`, `alternative_sequence`: the two sequences being compared.
- `fast`: when `true`, the sequences are walked in lockstep with `zip` and compared
  element by element. When `false` (default), the Levenshtein distance between them is
  obtained from `BioAlignments.pairalign`; `length(observed_sequence) - distance`
  factors of `accuracy` are applied, followed by `distance` factors of `error_rate`.

# Returns
The accumulated product.
"""
function emission_likelihood(;error_rate, accuracy, observed_sequence, alternative_sequence, fast=false)
    score = 1.0

    if !fast
        # Edit-distance mode: count edits via a Levenshtein alignment.
        aligned = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), observed_sequence, alternative_sequence)
        n_edits = BioAlignments.distance(aligned)
        n_matches = length(observed_sequence) - n_edits
        for _ in 1:n_matches
            score *= accuracy
        end
        for _ in 1:n_edits
            score *= error_rate
        end
        return score
    end

    # Fast mode: position-by-position comparison, no alignment.
    for (observed, alternative) in zip(observed_sequence, alternative_sequence)
        score *= (observed == alternative ? accuracy : error_rate)
    end
    return score
end

In [None]:
# Project layout, resolved relative to the parent of the current working directory:
#   <parent of pwd>/data/genomes
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = joinpath(data_dir, "genomes")
# Create the genomes directory (and any missing parents); `mkpath` returns the path.
mkpath(genome_dir)

In [None]:
# Candidate reference genomes: every path in `genome_dir` that ends in `.fna` and does
# not contain "normalized", ordered by file size (smallest first).
reference_fastas = sort(
    filter(readdir(genome_dir, join=true)) do path
        occursin(r"\.fna$", path) && !occursin("normalized", path)
    end,
    by=filesize
)

In [None]:
# Work with the first entry of `reference_fastas`; that list is sorted by file size,
# so this is the smallest reference file.
reference_fasta = first(reference_fastas)

In [None]:
# Preview (result is displayed, not stored): gzipped FASTQ files in `genome_dir` whose
# path contains both the reference FASTA path and "10x".
filter(readdir(genome_dir, join=true)) do path
    occursin(reference_fasta, path) &&
        occursin("10x", path) &&
        occursin(r"\.fq\.gz$", path)
end

In [None]:
# Reads to polish: the same selection as the preview, further restricted to paths
# that contain "filtlong". The result is a vector of paths.
fastq = filter(readdir(genome_dir, join=true)) do path
    occursin(reference_fasta, path) &&
        occursin("10x", path) &&
        occursin("filtlong", path) &&
        occursin(r"\.fq\.gz$", path)
end

In [None]:
# Ask Mycelia for a k-mer size based on the selected reads, with plotting disabled.
# NOTE(review): `fastq` is a vector of paths here; presumably the helper accepts a
# collection and returns an integer k -- confirm against Mycelia's implementation.
initial_k = Mycelia.assess_dnamer_saturation(fastq, plot=false)

In [None]:
# DNA k-mer type parameterized by the chosen k; used as the k-mer type when the graph is built.
initial_kmer_type = Kmers.DNAKmer{initial_k}

In [None]:
# kmer_iterator = Kmers.EveryKmer{initial_kmer_type}(BioSequences.LongDNA{2}(FASTX.sequence(first(collect(Mycelia.open_fastx(first(fastq)))))))

In [None]:
# Build the k-mer graph from the reads.
raw_graph = Mycelia.fastx_to_kmer_graph(initial_kmer_type, fastq)
# Keep only "solid" k-mers: vertices of the raw graph whose :count is at least 3.
# The filter must run on `raw_graph`, because `graph` does not exist until the next line.
solid_kmers = filter(v -> MetaGraphs.get_prop(raw_graph, v, :count) >= 3, Graphs.vertices(raw_graph))
# Restrict the raw graph to the solid k-mers; `vertex_map` maps new vertex ids to old ones.
graph, vertex_map = Graphs.induced_subgraph(raw_graph, solid_kmers)
# Allow `graph[kmer, :kmer]` lookups of a vertex index by k-mer.
MetaGraphs.set_indexing_prop!(graph, :kmer)

In [None]:
# Build a directed, strand-aware k-mer graph (`stranded_kmer_graph`) from the metagraph `graph`.
# Every k-mer stored in `graph` contributes two vertices: itself and its reverse complement.
# NOTE(review): the variable names imply `graph` stores canonical k-mers -- confirm upstream.

# Lazily read the :kmer property of every vertex in `graph`.
canonical_kmers = (MetaGraphs.get_prop(graph, v, :kmer) for v in Graphs.vertices(graph))
# Pair each k-mer with its reverse complement, flatten, sort, and de-duplicate.
unique_stranded_kmers = unique(sort(reduce(vcat, ([canonical_kmer, BioSequences.reverse_complement(canonical_kmer)] for canonical_kmer in canonical_kmers))))
# One vertex per stranded k-mer; carry the graph-level :k property over unchanged.
stranded_kmer_graph = MetaGraphs.MetaDiGraph(length(unique_stranded_kmers))
MetaGraphs.set_prop!(stranded_kmer_graph, :k, MetaGraphs.get_prop(graph, :k))
@info "adding kmers"
# Vertex i of the stranded graph holds the i-th k-mer of the sorted, unique list.
for (i, kmer) in enumerate(unique_stranded_kmers)
    MetaGraphs.set_prop!(stranded_kmer_graph, i, :kmer, kmer)
end
@info "indexing kmers"
# allow graph[kmer, :kmer] to dict-lookup the index of a kmer
MetaGraphs.set_indexing_prop!(stranded_kmer_graph, :kmer)

# Copy each vertex's :count onto both strands: the k-mer as stored and its reverse complement
# receive the same observation count.
for vertex in Graphs.vertices(graph)
    observation_count = MetaGraphs.get_prop(graph, vertex, :count)
    canonical_kmer = MetaGraphs.get_prop(graph, vertex, :kmer)
    MetaGraphs.set_prop!(stranded_kmer_graph, stranded_kmer_graph[canonical_kmer, :kmer], :count, observation_count)
    MetaGraphs.set_prop!(stranded_kmer_graph, stranded_kmer_graph[BioSequences.reverse_complement(canonical_kmer), :kmer], :count, observation_count)
end

# iterate through edges in graph and use edge evidence to build these stranded edges and their weights
# Each observation on an edge yields two directed edges in the stranded graph: one in the
# observed direction and one for the reverse-complement traversal.
for edge in Graphs.edges(graph)
    for edge_observation in MetaGraphs.get_prop(graph, edge, :observations)
        
        ##### forward
        # `orientation` is read as a pair of flags: first for the source, last for the destination.
        # A false flag means the stored k-mer is reverse-complemented before lookup.
        source = (index=edge.src, orientation=first(edge_observation.orientation))
        oriented_source = graph[source.index, :kmer]
        if !source.orientation
            oriented_source = BioSequences.reverse_complement(oriented_source)
        end
        
        dest = (index=edge.dst, orientation=last(edge_observation.orientation))
        oriented_dest = graph[dest.index, :kmer]
        if !dest.orientation
            oriented_dest = BioSequences.reverse_complement(oriented_dest)
        end
        
        # Translate the oriented k-mers into stranded-graph vertex indices and form the edge.
        stranded_edge = Graphs.Edge(stranded_kmer_graph[oriented_source, :kmer] => stranded_kmer_graph[oriented_dest, :kmer])
        # Only the record identifier and index are kept; the orientation is now encoded by the edge itself.
        stranded_observation = (record_identifier = edge_observation.record_identifier, index = edge_observation.index)
        if !Graphs.has_edge(stranded_kmer_graph, stranded_edge)
            # First observation of this edge: create it with a fresh observation set.
            Graphs.add_edge!(stranded_kmer_graph, stranded_edge)
            MetaGraphs.set_prop!(stranded_kmer_graph, stranded_edge, :observations, Set([stranded_observation]))
        else
            # Edge already present: add the observation to its existing set.
            stranded_observations = push!(MetaGraphs.get_prop(stranded_kmer_graph, stranded_edge, :observations), stranded_observation)
            MetaGraphs.set_prop!(stranded_kmer_graph, stranded_edge, :observations, stranded_observations)
        end
        
        
        ##### reverse_complement
        # Mirror image of the forward case: source and destination swap roles and both
        # orientation flags are negated.
        source = (index=edge.dst, orientation=!last(edge_observation.orientation))
        oriented_source = graph[source.index, :kmer]
        if !source.orientation
            oriented_source = BioSequences.reverse_complement(oriented_source)
        end
        
        dest = (index=edge.src, orientation=!first(edge_observation.orientation))
        oriented_dest = graph[dest.index, :kmer]
        if !dest.orientation
            oriented_dest = BioSequences.reverse_complement(oriented_dest)
        end
        
        stranded_edge = Graphs.Edge(stranded_kmer_graph[oriented_source, :kmer] => stranded_kmer_graph[oriented_dest, :kmer])
        stranded_observation = (record_identifier = edge_observation.record_identifier, index = edge_observation.index)
        if !Graphs.has_edge(stranded_kmer_graph, stranded_edge)
            Graphs.add_edge!(stranded_kmer_graph, stranded_edge)
            MetaGraphs.set_prop!(stranded_kmer_graph, stranded_edge, :observations, Set([stranded_observation]))
        else
            stranded_observations = push!(MetaGraphs.get_prop(stranded_kmer_graph, stranded_edge, :observations), stranded_observation)
            MetaGraphs.set_prop!(stranded_kmer_graph, stranded_edge, :observations, stranded_observations)
        end
        
    end
end
# Display the finished graph as the cell's result.
stranded_kmer_graph

In [None]:
# State priors: each stranded k-mer's share of the total observation count, plus the
# vertex ordering from most to least likely.
kmer_counts = [MetaGraphs.get_prop(stranded_kmer_graph, v, :count) for v in Graphs.vertices(stranded_kmer_graph)]
state_likelihoods = kmer_counts ./ sum(kmer_counts)
ranked_state_likelihoods = sortperm(state_likelihoods, rev=true)

# Transition weights start as inverse shortest-path distances between vertices.
transition_likelihoods = SparseArrays.spzeros(Graphs.nv(stranded_kmer_graph), Graphs.nv(stranded_kmer_graph))
ProgressMeter.@showprogress for v in Graphs.vertices(stranded_kmer_graph)
    shortest_paths_from_v = Graphs.dijkstra_shortest_paths(stranded_kmer_graph, v)
    # A distance of 0 is replaced by 2 before inverting, so that entry gets weight 1/2
    # instead of 1/0.
    adjusted_distances = [d == 0 ? 2 : d for d in shortest_paths_from_v.dists]
    row_weights = 1 ./ adjusted_distances
    transition_likelihoods[v, :] .= row_weights
end
# Normalize every row so its weights sum to 1.
ProgressMeter.@showprogress for v in Graphs.vertices(stranded_kmer_graph)
    row = transition_likelihoods[v, :]
    transition_likelihoods[v, :] .= row ./ sum(row)
end
transition_likelihoods

In [None]:
# Assumed per-base error rate, derived from the k-mer length as 1 / (k + 1);
# the assumed accuracy is its complement.
assumed_error_rate = 1 / (MetaGraphs.get_prop(graph, :k) + 1)
assumed_accuracy = 1 - assumed_error_rate

In [None]:
# Lookup table from stranded k-mer to its position in the stranded graph's vertex order.
kmer_indices = Dict(
    kmer => position
    for (position, kmer) in enumerate(
        [MetaGraphs.get_prop(stranded_kmer_graph, v, :kmer) for v in Graphs.vertices(stranded_kmer_graph)]
    )
)

In [None]:
# Pull the stored records out of the graph-level :records property and take the first
# one as the working example.
records = collect(values(MetaGraphs.get_prop(graph, :records)))
record = first(records)
# Convert the record's sequence to a 4-bit-encoded LongDNA.
sequence = BioSequences.LongDNA{4}(FASTX.sequence(record))


# observed_kmer_states = collect(Kmers.EveryKmer{Kmers.DNAKmer{MetaGraphs.get_prop(graph, :k)}}(sequence))
# observed_kmer_indices = [stranded_kmer_graph[kmer, :kmer] for kmer in last.(observed_kmer_states)]

In [None]:
# Enumerate every k-mer of the read and map it to its vertex index in the stranded graph.
# K-mers that are not in the graph map to `missing`.
observed_kmers = last.(Kmers.EveryKmer{Kmers.DNAKmer{MetaGraphs.get_prop(graph, :k)}}(sequence))
observed_kmer_indices = [get(kmer_indices, observed_kmer, missing) for observed_kmer in observed_kmers]

# Runs of consecutive positions whose k-mer is missing from the graph. A read with no
# missing k-mers simply has no stretches.
contiguous_stretches = let missing_positions = findall(ismissing, observed_kmer_indices)
    isempty(missing_positions) ? UnitRange{Int64}[] : find_contiguous_stretches_as_ranges(missing_positions)
end

# A stretch shorter than k is widened by ceil((k - length) / 2) positions on each side,
# marking those flanking positions as missing too. The window is clamped to the valid
# index range so stretches near either end of the read cannot index out of bounds.
for contiguous_stretch in contiguous_stretches
    kmer_length = MetaGraphs.get_prop(graph, :k)
    if length(contiguous_stretch) < kmer_length
        n_short = kmer_length - length(contiguous_stretch)
        n_to_prune_on_each_side = cld(n_short, 2)
        prune_start = max(firstindex(observed_kmer_indices), first(contiguous_stretch) - n_to_prune_on_each_side)
        prune_stop = min(lastindex(observed_kmer_indices), last(contiguous_stretch) + n_to_prune_on_each_side)
        # Positions inside the stretch are already missing, so one pass over the whole
        # window is equivalent to pruning the two flanks separately.
        for position in prune_start:prune_stop
            observed_kmer_indices[position] = missing
        end
    end
end
observed_kmer_indices
# contiguous_stretches = find_contiguous_stretches_as_ranges(findall(ismissing, observed_kmer_indices))

In [None]:
# Recompute the missing stretches now that pruning has widened them. When nothing is
# missing there are no stretches, so the helper is not called on an empty list.
contiguous_stretches = let missing_positions = findall(ismissing, observed_kmer_indices)
    isempty(missing_positions) ? UnitRange{Int64}[] : find_contiguous_stretches_as_ranges(missing_positions)
end

In [None]:
# For every missing stretch, look for a path through the stranded graph that bridges it.
# The flanking positions are widened one step at a time on both sides until A* finds a
# non-empty path or either flank reaches the end of the read.
for contiguous_stretch in contiguous_stretches
    # @show contiguous_stretch
    replacement_path_length = 0
    alternate_path = Graphs.edgetype(stranded_kmer_graph)[]
    prior_solid_kmer_index = first(contiguous_stretch)
    posterior_solid_kmer_index = last(contiguous_stretch)
    while (replacement_path_length == 0) && (prior_solid_kmer_index > 1) && (posterior_solid_kmer_index < length(observed_kmer_indices))
        prior_solid_kmer_index -= 1
        posterior_solid_kmer_index += 1
        source_vertex = observed_kmer_indices[prior_solid_kmer_index]
        target_vertex = observed_kmer_indices[posterior_solid_kmer_index]
        # Widening can land inside a neighbouring missing stretch; such a position has no
        # vertex to search from, so keep widening instead of passing `missing` to A*.
        if ismissing(source_vertex) || ismissing(target_vertex)
            continue
        end
        alternate_path = Graphs.a_star(stranded_kmer_graph, source_vertex, target_vertex)
        replacement_path_length = length(alternate_path)
    end
    @show length(alternate_path), length(contiguous_stretch), replacement_path_length
    @show prior_solid_kmer_index, posterior_solid_kmer_index
end

In [None]:
for contigu

In [None]:
# # emission_probability_distributions = Distributions.Categorical{Float64, Vector{Float64}}[]
# # ProgressMeter.@showprogress for v in Graphs.vertices(stranded_kmer_graph)
# #     # @show v
# #     state_emission_likelihoods = ones(Graphs.nv(stranded_kmer_graph))
# #     for (i, v2) in enumerate(Graphs.vertices(stranded_kmer_graph))
# #         state_emission_likelihoods[i] *= emission_likelihood(
# #                 error_rate=assumed_error_rate,
# #                 accuracy=assumed_accuracy,
# #                 observed_sequence=stranded_kmer_graph[v, :kmer],
# #                 alternative_sequence=stranded_kmer_graph[v2, :kmer])
        
# #     end
# #     state_emission_likelihoods = state_emission_likelihoods ./ sum(state_emission_likelihoods)
# #     push!(emission_probability_distributions, Distributions.Categorical(state_emission_likelihoods))
# # end
# # emission_probability_distributions
# # hmm = HiddenMarkovModels.HMM(state_likelihoods, transition_likelihoods, emission_probability_distributions)

# # Forward

# # Viterbi

# viterbi(hmm, obs_seq)

# records = collect(values(MetaGraphs.get_prop(graph, :records)))
# record = first(records)
# sequence = BioSequences.LongDNA{4}(FASTX.sequence(record))
# observed_kmer_states = collect(Kmers.EveryKmer{Kmers.DNAKmer{MetaGraphs.get_prop(graph, :k)}}(sequence))
# observed_kmer_indices = [stranded_kmer_graph[kmer, :kmer] for kmer in last.(observed_kmer_states)]
# log_likelihood_observation = last(HiddenMarkovModels.forward(hmm, observed_kmer_indices))

# HiddenMarkovModels.viterbi(hmm, observed_kmer_indices)

# maximum_likelihood_observation = 

In [None]:

# arrivals = Dict{NamedTuple{(:record_index, :state_index), Tuple{Int64, Int64}}, Graphs.SimpleGraphs.SimpleEdge{Int64}}()
# state_evaluation_matrix = SparseArrays.spzeros(BigFloat, Graphs.nv(stranded_kmer_graph), length(observed_kmer_states))
# (current_record_index, observed_kmer_state) = first(observed_kmer_states)
# prior_record_index = current_record_index - 1
# ProgressMeter.@showprogress for (state_index, alt_kmer) in enumerate(unique_stranded_kmers)
#     state_evaluation_matrix[state_index, current_record_index] = emission_likelihood(
#         error_rate=assumed_error_rate,
#         accuracy=assumed_accuracy,
#         observed_sequence=observed_kmer_state,
#         alternative_sequence=alt_kmer)
# end

In [None]:
# ProgressMeter.@showprogress for (current_record_index, observed_kmer_state) in observed_kmer_states[2:end]
#     prior_record_index = current_record_index - 1
#     ranked_prior_states = sortperm(state_evaluation_matrix[:, prior_record_index], rev=true)
#     matching_kmer_index = stranded_kmer_graph[observed_kmer_state, :kmer]
#     ranked_alternative_kmer_indices = filter(x -> x != matching_kmer_index, ranked_state_likelihoods)
#     ranked_next_states = vcat([matching_kmer_index], ranked_alternative_kmer_indices)

#     for source_index in ranked_prior_states
#         source_likelihood = state_evaluation_matrix[source_index, prior_record_index]
#         if source_likelihood > 0
#             for destination_index in ranked_next_states
#                 current_destination_likelihood = state_evaluation_matrix[destination_index, current_record_index]
#                 # we have a chance to find a higher likelihood path
#                 if source_likelihood > current_destination_likelihood
#                     considered_destination_likelihood = source_likelihood * transition_likelihoods[source_index, destination_index] * state_likelihoods[destination_index]
#                     if considered_destination_likelihood > current_destination_likelihood
#                         considered_destination_likelihood *= emission_likelihood(
#                             error_rate=assumed_error_rate,
#                             accuracy=assumed_accuracy,
#                             observed_sequence=observed_kmer_state,
#                             alternative_sequence=stranded_kmer_graph[destination_index, :kmer])
#                         if considered_destination_likelihood > current_destination_likelihood
#                             state_evaluation_matrix[destination_index, current_record_index] = considered_destination_likelihood
#                             arrivals[(record_index = current_record_index, state_index = destination_index)] = Graphs.Edge(source_index, destination_index)
#                         end
#                     end
#                 end
#             end
#         end
#     end
# end

In [None]:
# using Distributions, HiddenMarkovModels
# init = [0.4, 0.6]
# trans = [0.9 0.1; 0.2 0.8]
# dists = [Normal(-1.0), Normal(1.0)]
# hmm = HMM(init, trans, dists)

In [None]:
# shortest_paths = Dict{Graphs.Edge, NamedTuple{(:path, :likelihood), Tuple{Vector{Graphs.SimpleGraphs.SimpleEdge{Int64}}, Float64}}}()
# # 5 hours to solve - too long!
# ProgressMeter.@showprogress for row in 1:size(transition_likelihoods, 1)
#     for col in 1:size(transition_likelihoods, 2)
#         path_likelihood = transition_likelihoods[row, col]
#         if 1 > path_likelihood > 0
#             distance = Int(1 / path_likelihood) - 1
#             edge = Graphs.Edge(row, col)
#             if distance == 1
#                 path = [edge]
#             elseif distance > 1
#                 path = Graphs.a_star(stranded_kmer_graph, row, col)
#             end
#             for edge in path
#                 outneighbors = Graphs.outneighbors(stranded_kmer_graph, edge.src)
#                 outneighbor_weights = [length(MetaGraphs.get_prop(stranded_kmer_graph, Graphs.Edge(edge.src, outneighbor), :observations)) for outneighbor in outneighbors]
#                 target_index = findfirst(x -> x == edge.dst, outneighbors)
#                 outneighbor_likelihoods = outneighbor_weights ./ sum(outneighbor_weights)
#                 path_likelihood *= outneighbor_likelihoods[target_index]
#             end
#             shortest_paths[edge] = (path = path, likelihood = path_likelihood)
#         end
#         transition_likelihoods[row, col] = path_likelihood
#     end
# end
# shortest_paths

In [None]:
# solve the 

In [None]:
# function initialize_transition_probabilities(kmer_graph)
    
#     total_kmers = Graphs.nv(kmer_graph)
#     transition_likelihoods = Dict(
#         true => SparseArrays.spzeros(total_kmers, total_kmers),
#         false => SparseArrays.spzeros(total_kmers, total_kmers)
#     )

#     for edge in collect(Graphs.edges(kmer_graph))
# #         weight = length(kmer_graph.eprops[edge][:evidence])
#         weight = kmer_graph.eprops[edge][:weight]
#         for o in kmer_graph.eprops[edge][:orientations]
#             transition_likelihoods[o.source_orientation][edge.src, edge.dst] = weight
#         end
#     end

#     for source_orientation in (true, false)
#         for src in 1:total_kmers
#             transition_weights = transition_likelihoods[source_orientation][src, :]
#             total_weight = sum(transition_weights)
#             dsts, vals = SparseArrays.findnz(transition_weights)
#             for (dst, val) in zip(dsts, vals) 
#                 transition_likelihoods[source_orientation][src, dst] = val / total_weight
#             end
#             normalized_probability = sum(transition_likelihoods[source_orientation][src, :])
#             @assert isapprox(normalized_probability, 0) || isapprox(normalized_probability, 1)
#         end
#     end
#     return transition_likelihoods
# end

In [None]:
# function run_viterbi!(
#         current_state,
#         prior_state,
#         observed_nucleotide,
#         observed_quality_score,
#         observed_error_rate,
#         current_vertex,
#         prior_vertex,
#         state_likelihoods,
#         transition_likelihoods,
#         shortest_paths,
#         arrival_paths,
#         kmer_graph,
#         kmer_likelihoods
#         )
#     # if probability of prior state is lower than current probability, skip

# #     @show current_state
# #     @show prior_state
# #     @show current_vertex
# #     @show prior_vertex
    
    
#     current_state_likelihood = state_likelihoods[current_vertex, current_state]
#     prior_state_likelihood = state_likelihoods[prior_vertex, prior_state]

#     # if we already have a better possible path, skip calculating anything
#     if prior_state_likelihood < current_state_likelihood
# #         @show prior_state_likelihood < current_state_likelihood
#         return
#     end

#     # take shortest path and assume it's the maximum likelihood path
#     # this assumption seems fair because in an ideal situation
#     # we're just moving to an adjacent kmer
#     # and the shortest path and most likely path should be the same
#     shortest_path = shortest_paths[prior_vertex][current_vertex]
    
# #     no path & not considering insertion
#     if isempty(shortest_path) && (prior_vertex != current_vertex)
# #         @show "no path, skipping"
#         return
#     end
    
#     # if shortest path isn't viable, exit
#     if !isempty(shortest_path)
# #         @show "checking if path is viable"

#         terminal_orientation_prior_state = last(last(arrival_paths[prior_vertex, prior_state]))
# #         @show arrival_paths[prior_vertex, prior_state]
# #         @show "we were at vertex $(prior_vertex) in orientation $(terminal_orientation_prior_state)"
#         candidate_edge = Graphs.Edge(shortest_path[1], shortest_path[2])
                
#         if !ismissing(terminal_orientation_prior_state) && 
#             !any(o -> o.source_orientation == terminal_orientation_prior_state, kmer_graph.eprops[candidate_edge][:orientations])
            
# #             @show "no viable orientation matching edges detected between $(candidate_edge)"
# #             @show "full candidate path was $(shortest_path)"
# #             @show "orientation options were:"
# #             @show kmer_graph.eprops[candidate_edge][:orientations]
#             return
#         end
#     end
    
#     # zero step path - insertion in observed sequence relative to kmer graph
#     is_same_vertex = (current_vertex == prior_vertex)
#     has_edge = Graphs.has_edge(kmer_graph, Graphs.Edge(prior_vertex, current_vertex))
#     if is_same_vertex && has_edge
#         shortest_path = [prior_vertex, current_vertex]
#     end
    
#     if is_same_vertex
# #         @show "same vertex, considering insertion potential"
#         emission_likelihood = observed_error_rate
#         transition_likelihood = observed_error_rate
#         state_likelihood = kmer_likelihoods[current_vertex]
#         path_likelihood = prior_state_likelihood * emission_likelihood * transition_likelihood * state_likelihood
#         path = [last(arrival_paths[prior_vertex, prior_state])]

#         if current_state_likelihood > state_likelihoods[current_vertex, current_state]
# #             @show "selecting path"
# #             @show path
# #             @show path_likelihood
#             state_likelihoods[current_vertex, current_state] = path_likelihood
#             arrival_paths[current_vertex, current_state] = path
#         end
#     # one or more step path - match, mismatch, or deletion in observed sequence relative to kmer graph
#     elseif !isempty(shortest_path)
# #         @show "path is viable!"
# #         @show "considering shortest path: $(shortest_path)"

#         initial_path_state = last(arrival_paths[prior_vertex, prior_state])

#         path = Vector{typeof(initial_path_state)}(undef, length(shortest_path))
#         path[1] = initial_path_state

#         path_likelihood::Float64 = state_likelihoods[prior_vertex, prior_state]

#         for i in 2:length(shortest_path)

#             this_vertex = shortest_path[i]
#             prior_vertex, prior_orientation = path[i-1]
#             edge = Graphs.Edge(prior_vertex, this_vertex)

#             possible_edge_orientations::Set{NamedTuple{(:source_orientation, :destination_orientation), Tuple{Bool, Bool}}} = kmer_graph.eprops[edge][:orientations]
            
# #             @show possible_edge_orientations
            
#             if !ismissing(prior_orientation)
#                 possible_edge_orientations = filter(o -> o.source_orientation == prior_orientation, possible_edge_orientations)
#             end
            
# #             @show possible_edge_orientations
            
#             if isempty(possible_edge_orientations)
#                 path_likelihood *= 0.0
#                 path = Vector{eltype(path)}()
# #                 @show "no possible orientations, bailing early"
#                 break
#             end

# #             @show prior_orientation
#             if ismissing(prior_orientation)
#                 if transition_likelihoods[true][prior_vertex, this_vertex] > transition_likelihoods[false][prior_vertex, this_vertex]
#                     prior_orientation = true
#                     transition_likelihood = transition_likelihoods[true][prior_vertex, this_vertex]::Float64
#                 elseif transition_likelihoods[true][prior_vertex, this_vertex] < transition_likelihoods[false][prior_vertex, this_vertex]
#                     prior_orientation = false
#                     transition_likelihood = transition_likelihoods[false][prior_vertex, this_vertex]::Float64
#                 else transition_likelihoods[true][prior_vertex, this_vertex] == transition_likelihoods[false][prior_vertex, this_vertex]
#                     prior_orientation = missing
#                     transition_likelihood = transition_likelihoods[true][prior_vertex, this_vertex]::Float64
#                 end
#             else
#                 transition_likelihood = transition_likelihoods[prior_orientation][prior_vertex, this_vertex]::Float64
#             end
#             state_likelihood::Float64 = kmer_likelihoods[this_vertex]
#             path_likelihood *= transition_likelihood * state_likelihood
            
#             if length(possible_edge_orientations) == 1
#                 orientation = first(possible_edge_orientations).destination_orientation
#                 path[i] = this_vertex => orientation
#             else
#                 path[i] = this_vertex => missing
#             end
#         end

#         # see if new nucleotide is a match or mismatch to terminal kmer in path
#         if !isempty(path) && path_likelihood > 0
#             terminal_kmer_index, terminal_kmer_orientation = last(path)
#             terminal_kmer = BioSequences.LongDNASeq(kmer_graph.vprops[terminal_kmer_index][:kmer])::BioSequences.LongDNASeq
#             if ismissing(terminal_kmer_orientation)
#                 fw_is_match = observed_nucleotide == last(terminal_kmer)
#                 bw_is_match = observed_nucleotide == last(BioSequences.reverse_complement!(terminal_kmer))
#                 if fw_ismatch && !bw_is_match
#                     path[end] = terminal_kmer_index => true
#                     path_likelihood *= 1 - observed_error_rate
#                 elseif !fw_ismatch && bw_is_match
#                     path[end] = terminal_kmer_index => false
#                     path_likelihood *= 1 - observed_error_rate
#                 elseif fw_ismatch && bw_is_match
#                     path_likelihood *= 1 - observed_error_rate
#                 elseif !fw_ismatch && !bw_is_match
#                     path_likelihood *= observed_error_rate
#                 end
#             elseif terminal_kmer_orientation
#                 is_match = observed_nucleotide == last(terminal_kmer)
#                 if is_match
#                     path_likelihood *= 1 - observed_error_rate
#                 else
#                     path_likelihood *= observed_error_rate
#                 end
#             else
#                 terminal_kmer = BioSequences.reverse_complement!(terminal_kmer)
#                 is_match = observed_nucleotide == last(terminal_kmer)
#                 if is_match
#                     path_likelihood *= 1 - observed_error_rate
#                 else
#                     path_likelihood *= observed_error_rate
#                 end
#             end
#         end

#         if path_likelihood > state_likelihoods[current_vertex, current_state]
# #             @show "selecting path"
# #             @show path
# #             @show path_likelihood
#             state_likelihoods[current_vertex, current_state] = path_likelihood
#             arrival_paths[current_vertex, current_state] = path
#         end
#     end
#     return
# end

In [None]:
# function polish_fastq(kmer_graph, fastq_file)

# #     @info "Assessing kmer likelihoods"
#     kmers = [kmer_graph.vprops[v][:kmer] for v in Graphs.vertices(kmer_graph)]
# #     kmer_counts = [length(kmer_graph.vprops[v][:evidence]) for v in Graphs.vertices(kmer_graph)]
#     kmer_counts = [kmer_graph.vprops[v][:weight] for v in Graphs.vertices(kmer_graph)]
#     kmer_likelihoods = kmer_counts ./ sum(kmer_counts)
#     k = kmer_graph.gprops[:k]
#     kmer_type = BioSequences.BigDNAMer{k}
#     total_kmers = length(kmers)
    
# #     @info "determining shortest paths between kmers"
#     shortest_paths = Graphs.enumerate_paths(Graphs.floyd_warshall_shortest_paths(kmer_graph));
    
#     @info "counting the number of records to establish runtime estimate"
#     number_of_records = 0
#     for fastq_record in FASTX.FASTQ.Reader(open(fastq_file))
#         number_of_records += 1
#     end
#     progress_bar = ProgressMeter.Progress(number_of_records, 1)
    
#     output_fastq_file = replace(fastq_file, ".fastq" => ".k$(kmer_graph.gprops[:k]).fastq")
#     fastq_writer = FASTX.FASTQ.Writer(open(output_fastq_file, "w"))
#     for fastq_record in FASTX.FASTQ.Reader(open(fastq_file))
#         ProgressMeter.next!(progress_bar)
        
# #         @info "Initializing matrices"
#         total_states = length(FASTX.sequence(fastq_record))-k+1
#         transition_likelihoods = initialize_transition_probabilities(kmer_graph)
#         state_likelihoods = zeros(total_kmers, total_states)
#         arrival_paths = fill(Pair{Int, Union{Bool, Missing}}[], total_kmers, total_states)

# #         @info "Determining Likelihoods of initial states"
#         initial_state = first(BioSequences.each(kmer_type, FASTX.sequence(fastq_record)))
#         current_state = 1
#         # note this is a place for potential improvement, use the q value at each base to guide probability rather than median
#         median_q_value = Statistics.median(Int.(FASTX.quality(fastq_record)[1:k]))
#         current_error_rate = q_value_to_error_rate(median_q_value)
#         # canonical_kmer = BioSequences.canonical(initial_state.fw)
#         set_initial_state_likelihoods!(
#                 kmer_graph,
#                 initial_state,
#                 kmer_likelihoods,
#                 current_error_rate,
#                 state_likelihoods,
#                 arrival_paths
#             )

# #         @info "Determining likelihood of downstream states"

# #         non_singleton_states = findall(kmer_counts .> 1)

#         for current_state in 2:total_states
#             prior_state = current_state - 1

#         #     observed_kmer = BioSequences.BigDNAMer{k}(FASTX.sequence(fastq_record)[current_state:current_state+k-1])

#         #     @assert observed_kmer == collect(BioSequences.each(kmer_type, FASTX.sequence(fastq_record)))[current_state].fw

#         #     canonical_kmer = BioSequences.canonical(observed_kmer)

#             observed_nucleotide = FASTX.sequence(fastq_record)[k-1+current_state]
#         #     observed_nucleotide = last(observed_kmer)
#             observed_quality_score = FASTX.quality(fastq_record)[k-1+current_state]
#             observed_error_rate = q_value_to_error_rate(observed_quality_score)

#             # we'll assess prior states in order of decreasing likelihood
#             # such that we maximize how frequently we are able to utilize the
#             # current_state_likelihood > candidate prior state
#             # break that won't bother evaluating lower likelihood possibilities
#             prior_states_in_decreasing_likelihood = sortperm(state_likelihoods[:, prior_state], rev=true)

#             # and skip all prior states with zero probability

#             for current_vertex in total_states
#                 for prior_vertex in prior_states_in_decreasing_likelihood
#                     if state_likelihoods[prior_vertex, prior_state] > 0
#                         run_viterbi!(
#                                 current_state,
#                                 prior_state,
#                                 observed_nucleotide,
#                                 observed_quality_score,
#                                 observed_error_rate,
#                                 current_vertex,
#                                 prior_vertex,
#                                 state_likelihoods,
#                                 transition_likelihoods,
#                                 shortest_paths,
#                                 arrival_paths,
#                                 kmer_graph,
#                                 kmer_likelihoods
#                                 )
#                     end
#                 end
#             end
#         end

# #         try
#         maximum_likelihood_path, maximum_likelihood_value = 
#             determine_maximum_likelihood_path(
#                 state_likelihoods,
#                 arrival_paths
#                 )
# #         catch
# #             return state_likelihoods, arrival_paths
# #         end

#         sequence = oriented_path_to_sequence(kmer_graph, maximum_likelihood_path)

# #         @info "comparing to original path"
#         original_sequence_likelihood = oriented_path_to_likelihood(kmer_graph, kmers, kmer_likelihoods, transition_likelihoods, fastq_record)
#         relative_likelihood = maximum_likelihood_value / original_sequence_likelihood
# #         relative_likelihood_formatted = NumericIO.formatted(relative_likelihood, ndigits=1, charset=:ASCII)
# #         println("relative likelihood of new path to old path is $(relative_likelihood_formatted)")

# #         @info "writing updated record"
#         identifier = FASTX.identifier(fastq_record) * "_k$(k)"
#         description = string(relative_likelihood)
#         # because the sequences won't always be the same length, we take an ordered sampling with replacement
#         # which introduces some random error but preserves overall patterns and areas of high/low accuracy
#         quality_scores = StatsBase.sample(FASTX.quality(fastq_record), length(sequence), ordered=true)

#         new_fastq_record = FASTX.FASTQ.Record(
#             identifier,
#             description,
#             sequence,
#             quality_scores
#         )
#         write(fastq_writer, new_fastq_record)
#     end
#     close(fastq_writer)
#     return output_fastq_file
# end

In [None]:
# function determine_maximum_likelihood_path(
#     state_likelihoods,
#     arrival_paths
#     )
#     maximum_likelihood_value = maximum(state_likelihoods[:, end])

#     maximum_likelihood_path_indices = findall(state_likelihoods[:, end] .== maximum_likelihood_value)

#     # if multiple paths are tied, randomly choose one
#     maximum_likelihood_path_index = rand(maximum_likelihood_path_indices)

#     maximum_likelihood_path = arrival_paths[maximum_likelihood_path_index, end]

#     for state_index in size(state_likelihoods, 2)-1:-1:1
#         next_kmer, next_orientation = first(maximum_likelihood_path)
#         maximum_likelihood_arrival_path = arrival_paths[next_kmer, state_index]
        
#         is_match = last(maximum_likelihood_arrival_path) == (next_kmer => next_orientation)
#         if !ismissing(is_match) && !is_match
#             error("breaking")
#         end
#         maximum_likelihood_path = vcat(maximum_likelihood_arrival_path[1:end-1], maximum_likelihood_path)
#     end
#     return maximum_likelihood_path, maximum_likelihood_value
# end

In [None]:
# Mycelia.polish_fastq(graph, fastq)