In [None]:
# if hit plotting library issues, try resetting LD path for julia
# can set in ~/.local/share/jupyter/kernels/
haskey(ENV, "LD_LIBRARY_PATH") && @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg
# Packages imported by this notebook; each name is listed exactly once.
pkgs = [
    "Revise",
    "FASTX",
    "BioSequences",
    "Kmers",
    "Graphs",
    "MetaGraphs",
    "SparseArrays",
    "ProgressMeter",
    "Distributions",
    "HiddenMarkovModels",
    "BioAlignments",
    "StatsBase",
    "Random",
    "StatsPlots",
    "Statistics",
    # "GraphMakie",
    "IterTools",
    "Primes",
    "OnlineStats",
    "IteratorSampling",
    "HypothesisTests",
    "Clustering",
    "Distances"
]
# Pkg.add(pkgs)
for pkg in pkgs
    # `import` needs a literal module name, so splice the name in as a Symbol
    # rather than parsing and evaluating a hand-built source string.
    @eval import $(Symbol(pkg))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# The project root is the parent of the current working directory; the
# genomes directory is created under <root>/data if it does not exist yet.
PROJECT_BASEDIR = pwd() |> dirname
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = joinpath(data_dir, "genomes") |> mkpath

In [None]:
# Scratch directory for this notebook, created on demand under the data directory.
working_dir = mkpath(joinpath(data_dir, "test"))

In [None]:
# short_read_sets = unique(map(x -> match(r"^(.+\.\d+x)\.", x).captures[1], filter(x -> occursin(r"\.fna\.art", x) && occursin(r"\.fq\.gz", x) && !occursin("trimming_report", x) && !occursin("_val_", x), sort(readdir(genome_dir, join=true), by=x->filesize(x)))))
# # forward = short_read_set * ".1_val_1.fq.gz"
# # reverse = short_read_set * ".2_val_2.fq.gz"

In [None]:
# Collect the filtlong-filtered long-read FASTQ files in the genomes directory,
# ordered by file size (smallest first), and work on the smallest one.
long_read_fastqs = sort(
    filter(
        path -> occursin(r"\.filtlong\.fq\.gz$", path),
        readdir(genome_dir, join=true)
    ),
    by=filesize
)
fastq = long_read_fastqs[1]

In [None]:
# reference_fasta = replace(fastq, r"\.badread.*" => "")

In [None]:
"""
    find_resampling_stretches(; record_kmer_solidity, solid_branching_kmer_indices)

Locate the index ranges of a read that should be resampled.

Every maximal run of consecutive `false` entries in `record_kmer_solidity` is
bracketed by the closest entry of `solid_branching_kmer_indices` strictly before
the run and the closest entry strictly after it. Runs that lack an anchor on
either side are skipped. The resulting ranges are returned without duplicates,
in order of first appearance, as a `Vector{UnitRange{Int64}}`.
"""
function find_resampling_stretches(;record_kmer_solidity, solid_branching_kmer_indices)
    weak_positions = findall(!, record_kmer_solidity)
    isempty(weak_positions) && return UnitRange{Int64}[]

    # Collapse consecutive non-solid positions into (start, stop) runs.
    run_bounds = Tuple{Int, Int}[]
    open_start = first(weak_positions)
    previous = open_start
    for position in Iterators.drop(weak_positions, 1)
        if position - previous > 1
            push!(run_bounds, (open_start, previous))
            open_start = position
        end
        previous = position
    end
    push!(run_bounds, (open_start, previous))

    # Widen each run to its nearest flanking solid branching k-mers.
    stretches = UnitRange{Int64}[]
    for (run_first, run_last) in run_bounds
        left_anchors = [idx for idx in solid_branching_kmer_indices if idx < run_first]
        right_anchors = [idx for idx in solid_branching_kmer_indices if idx > run_last]
        (isempty(left_anchors) || isempty(right_anchors)) && continue
        push!(stretches, maximum(left_anchors):minimum(right_anchors))
    end

    # Neighbouring runs can share the same pair of anchors; keep each range once.
    return unique!(stretches)
end

In [None]:
"""
    fastq_record(; identifier, sequence, quality_scores)

Build a FASTQ record from its parts by rendering the four-line FASTQ text and
parsing it with `FASTX.parse`. Quality scores above 93 are capped at 93 and
each score is written as the character with code `score + 33`.
"""
function fastq_record(;identifier, sequence, quality_scores)
    # Fastx wont parse anything higher than 93
    capped_scores = min.(quality_scores, 93)
    quality_line = join(Char(score + 33) for score in capped_scores)
    header_line = "@" * identifier
    record_text = join([header_line, sequence, "+", quality_line], "\n")
    return FASTX.parse(FASTX.FASTQRecord, record_text)
end

In [None]:
"""
    process_fastq_record(; record, kmer_graph, yen_k_shortest_paths_and_weights, yen_k=7)

Polish a single FASTQ `record` against `kmer_graph` and return the revised record.

The read is decomposed into k-mers (k is the graph's `:assembly_k` property).
K-mers listed in the graph's `:likely_valid_kmer_indices` are treated as solid.
Leading and trailing non-solid k-mers are trimmed, then every stretch returned
by `find_resampling_stretches` is replaced by a path sampled from the Yen
k-shortest paths between its two anchoring solid branching k-mers. When at most
one path is available the read's own k-mers are kept for that stretch.

# Keywords
- `record`: the FASTQ record to polish.
- `kmer_graph`: graph carrying the properties read below (`:ordered_kmers`,
  `:likely_valid_kmer_indices`, `:kmer_indices`, `:branching_nodes`,
  `:assembly_k`, `:transition_likelihoods`, `:kmer_mean_quality`,
  `:kmer_total_quality`).
- `yen_k_shortest_paths_and_weights`: cache mapping `u => v` vertex pairs to
  their candidate paths and weights; it is filled in place so later records
  reuse earlier path searches.
- `yen_k`: number of candidate paths requested per vertex pair.

# Returns
The input `record` unchanged when it contains no solid k-mer or nothing needs
resampling; otherwise a new record whose identifier has the k value appended.
"""
function process_fastq_record(;record, kmer_graph, yen_k_shortest_paths_and_weights, yen_k=7)
    ordered_kmers = MetaGraphs.get_prop(kmer_graph, :ordered_kmers)
    likely_valid_kmers = Set(ordered_kmers[MetaGraphs.get_prop(kmer_graph, :likely_valid_kmer_indices)])
    kmer_to_index_map = MetaGraphs.get_prop(kmer_graph, :kmer_indices)
    branching_nodes_set = MetaGraphs.get_prop(kmer_graph, :branching_nodes)
    assembly_k = MetaGraphs.get_prop(kmer_graph, :assembly_k)
    transition_likelihoods = MetaGraphs.get_prop(kmer_graph, :transition_likelihoods)
    kmer_mean_quality = MetaGraphs.get_prop(kmer_graph, :kmer_mean_quality)
    kmer_total_quality = MetaGraphs.get_prop(kmer_graph, :kmer_total_quality)

    new_record_identifier = FASTX.identifier(record) * ".k$(assembly_k)"
    record_sequence = BioSequences.LongDNA{4}(FASTX.sequence(record))

    kmer_type = Kmers.DNAKmer{assembly_k}
    record_kmers = last.(collect(Kmers.EveryKmer{kmer_type}(record_sequence)))
    record_quality_scores = collect(FASTX.quality_scores(record))
    # per-k-mer windows of base qualities, one window per k-mer start position
    record_kmer_quality_scores = [record_quality_scores[i:i+assembly_k-1] for i in 1:length(record_quality_scores)-assembly_k+1]

    record_kmer_solidity = map(kmer -> kmer in likely_valid_kmers, record_kmers)
    record_branching_kmers = [kmer_to_index_map[kmer] in branching_nodes_set for kmer in record_kmers]

    # trim the non-solid beginning and end of the read; the per-k-mer flags are
    # position-wise, so slicing them is equivalent to recomputing them
    initial_solid_kmer = findfirst(record_kmer_solidity)
    if isnothing(initial_solid_kmer)
        return record
    end
    last_solid_kmer = findlast(record_kmer_solidity)
    solid_span = initial_solid_kmer:last_solid_kmer
    record_kmers = record_kmers[solid_span]
    record_kmer_quality_scores = record_kmer_quality_scores[solid_span]
    record_kmer_solidity = record_kmer_solidity[solid_span]
    record_branching_kmers = record_branching_kmers[solid_span]
    record_solid_branching_kmers = record_kmer_solidity .& record_branching_kmers

    # identify low quality runs and the solid branchpoints we will use for resampling
    solid_branching_kmer_indices = findall(record_solid_branching_kmers)
    resampling_stretches = find_resampling_stretches(;record_kmer_solidity, solid_branching_kmer_indices)

    # nothing to do
    if isempty(resampling_stretches)
        return record
    end
    # everything before the first resampling stretch is kept verbatim
    trusted_range = 1:max(first(first(resampling_stretches))-1, 1)

    new_record_kmers = record_kmers[trusted_range]
    new_record_kmer_qualities = record_kmer_quality_scores[trusted_range]

    for (i, resampling_stretch) in enumerate(resampling_stretches)
        starting_solid_kmer = record_kmers[first(resampling_stretch)]
        ending_solid_kmer = record_kmers[last(resampling_stretch)]

        u = kmer_to_index_map[starting_solid_kmer]
        v = kmer_to_index_map[ending_solid_kmer]
        if !haskey(yen_k_shortest_paths_and_weights, u => v)
            # first time this anchor pair is seen: search once and cache the scored paths
            yen_k_result = Graphs.yen_k_shortest_paths(kmer_graph, u, v, Graphs.weights(kmer_graph), yen_k)
            yen_k_shortest_paths_and_weights[u => v] = Vector{Pair{Vector{Int}, Float64}}()
            for path in yen_k_result.paths
                path_weight = Statistics.mean([kmer_total_quality[ordered_kmers[node]] for node in path])
                path_transition_likelihoods = 1.0
                for (a, b) in zip(path[1:end-1], path[2:end])
                    path_transition_likelihoods *= transition_likelihoods[a, b]
                end
                joint_weight = path_weight * path_transition_likelihoods
                push!(yen_k_shortest_paths_and_weights[u => v], path => joint_weight)
            end
        end
        yen_k_path_weights = yen_k_shortest_paths_and_weights[u => v]
        if length(yen_k_path_weights) > 1
            current_distance = length(resampling_stretch)
            initial_weights = last.(yen_k_path_weights)
            path_lengths = length.(first.(yen_k_path_weights))
            deltas = map(l -> abs(l-current_distance), path_lengths)
            # halve a path's weight for every k-mer its length differs from the original stretch
            adjusted_weights = initial_weights .* map(d -> exp(-d * log(2)), deltas)
            # make it more severe?
            # adjusted_weights = adjusted_weights.^2

            # and a bonus for usually being correct

            selected_path_index = StatsBase.sample(StatsBase.weights(adjusted_weights))
            selected_path = first(yen_k_path_weights[selected_path_index])
            selected_path_kmers = [ordered_kmers[kmer_index] for kmer_index in selected_path]
        else
            # no alternative path available: keep the read's own k-mers for this stretch
            selected_path_kmers = record_kmers[resampling_stretch]
        end
        # the stretch starts on an anchor k-mer that may already be the last emitted k-mer
        if last(new_record_kmers) == first(selected_path_kmers)
            selected_path_kmers = selected_path_kmers[2:end]
        end
        append!(new_record_kmers, selected_path_kmers)
        selected_kmer_qualities = [Int8.(min.(typemax(Int8), floor.(kmer_mean_quality[kmer]))) for kmer in selected_path_kmers]
        append!(new_record_kmer_qualities, selected_kmer_qualities)

        next_solid_start = last(resampling_stretch)+1
        if i < length(resampling_stretches) # append high quality gap
            next_solid_stop = first(resampling_stretches[i+1])-1
            if !isempty(next_solid_start:next_solid_stop)
                append!(new_record_kmers, record_kmers[next_solid_start:next_solid_stop])
                append!(new_record_kmer_qualities, record_kmer_quality_scores[next_solid_start:next_solid_stop])
            end
        else # append remainder of sequence
            # `<=` so that a single trailing k-mer after the last stretch is kept
            if next_solid_start <= length(record_kmers)
                append!(new_record_kmers, record_kmers[next_solid_start:end])
                append!(new_record_kmer_qualities, record_kmer_quality_scores[next_solid_start:end])
            end
        end
    end

    # sanity check: the same k-mer must never be emitted twice in a row
    for (a, b) in zip(new_record_kmers[1:end-1], new_record_kmers[2:end])
        @assert a != b
    end
    new_record_sequence = Mycelia.kmer_path_to_sequence(new_record_kmers)
    # the first k-mer contributes all of its base qualities, each later k-mer only its last base
    new_record_quality_scores = copy(new_record_kmer_qualities[1])
    for new_record_kmer_quality in new_record_kmer_qualities[2:end]
        push!(new_record_quality_scores, last(new_record_kmer_quality))
    end
    new_record = fastq_record(identifier=new_record_identifier, sequence=new_record_sequence, quality_scores=new_record_quality_scores)
    return new_record
end

In [None]:
"""
    polish_fastq(; fastq, k=1)

Run one polishing round over every record of `fastq`.

A k-mer graph is built with `build_directed_kmer_graph`, each record is passed
through `process_fastq_record` (sharing one shortest-path cache), and the
revised records are written next to the input with the k value inserted into
the file name. The output is then compressed with `gzip --force`.

Returns a named tuple `(fastq, k)` holding the path of the gzipped output and
the k that was used.
"""
function polish_fastq(;fastq, k=1)
    kmer_graph = build_directed_kmer_graph(fastq=fastq, k=k)
    assembly_k = MetaGraphs.get_prop(kmer_graph, :assembly_k)
    @info "polishing with k = $(assembly_k)"

    # path searches are cached across records, keyed by (source => target) vertex pair
    path_cache = Dict{Pair{Int, Int}, Vector{Pair{Vector{Int}, Float64}}}()
    polished_records = []
    ProgressMeter.@showprogress for record in collect(Mycelia.open_fastx(fastq))
        push!(
            polished_records,
            process_fastq_record(;record, kmer_graph, yen_k_shortest_paths_and_weights=path_cache)
        )
    end

    fastq_out = replace(fastq, Mycelia.FASTQ_REGEX => ".k$(assembly_k).fq")
    open(fastq_out, "w") do io
        writer = FASTX.FASTQ.Writer(io)
        foreach(polished -> write(writer, polished), polished_records)
        close(writer)
    end
    run(`gzip --force $(fastq_out)`)

    gzipped_path = fastq_out * ".gz"
    return (fastq = gzipped_path, k = assembly_k)
end

In [None]:
"""
    build_directed_kmer_graph(; fastq, k=1, plot=true)

Build a directed k-mer graph (`MetaGraphs.MetaDiGraph`) from the reads in `fastq`.

# Keywords
- `fastq`: path of the FASTQ file to read.
- `k`: k-mer size. `k == 1` means "choose automatically" via
  `Mycelia.assess_dnamer_saturation`; any other value must be odd and prime.
- `plot`: when `true`, display a scatter plot of per-k-mer total quality split
  at the mean.

# Graph properties set
`:assembly_k`, `:kmer_counts`, `:total_states`, `:ordered_kmers`, `:kmer_indices`,
`:canonical_kmer_counts`, `:canonical_kmer_indices`, `:kmer_mean_quality`,
`:kmer_total_quality`, `:state_likelihoods`, `:transition_likelihoods`,
`:unbranching_nodes`, `:branching_nodes`, `:likely_valid_kmer_indices` and
`:likely_sequencing_artifact_indices`. Each edge carries `:transition_likelihood`.

# Returns
The populated graph.
"""
function build_directed_kmer_graph(;fastq, k=1, plot=true)
    if k == 1
        assembly_k = Mycelia.assess_dnamer_saturation([fastq])
    else
        @assert isodd(k)
        @assert Primes.isprime(k)
        assembly_k = k
    end
    kmer_type = Kmers.DNAKmer{assembly_k}

    # initializing the graph with kmer counts
    kmer_counts = Mycelia.count_kmers(kmer_type, fastq)
    ordered_kmers = collect(keys(kmer_counts))
    total_states = length(ordered_kmers)
    graph = MetaGraphs.MetaDiGraph(total_states)
    MetaGraphs.set_prop!(graph, :assembly_k, assembly_k)
    MetaGraphs.set_prop!(graph, :kmer_counts, kmer_counts)
    MetaGraphs.set_prop!(graph, :total_states, total_states)
    MetaGraphs.set_prop!(graph, :ordered_kmers, ordered_kmers)
    kmer_indices = Dict(kmer => i for (i, kmer) in enumerate(keys(kmer_counts)))
    MetaGraphs.set_prop!(graph, :kmer_indices, kmer_indices)
    canonical_kmer_counts = Mycelia.count_canonical_kmers(kmer_type, fastq)
    MetaGraphs.set_prop!(graph, :canonical_kmer_counts, canonical_kmer_counts)
    canonical_kmer_indices = Dict(kmer => i for (i, kmer) in enumerate(keys(canonical_kmer_counts)))
    MetaGraphs.set_prop!(graph, :canonical_kmer_indices, canonical_kmer_indices)


    # kmer quality and likelihoods

    records = collect(Mycelia.open_fastx(fastq))
    # per k-mer, the position-wise sum of base qualities over every observation
    all_kmer_quality_support = Dict{kmer_type, Vector{Float64}}()
    for record in records
        record_quality_scores = collect(FASTX.quality_scores(record))
        record_quality_score_slices = [record_quality_scores[i:i+assembly_k-1] for i in 1:length(record_quality_scores)-assembly_k+1]
        sequence = BioSequences.LongDNA{2}(FASTX.sequence(record))
        for ((i, kmer), kmer_base_qualities) in zip(Kmers.EveryKmer{kmer_type}(sequence), record_quality_score_slices)
            if haskey(all_kmer_quality_support, kmer)
                all_kmer_quality_support[kmer] = all_kmer_quality_support[kmer] .+ kmer_base_qualities
            else
                all_kmer_quality_support[kmer] = kmer_base_qualities
            end
        end
    end

    # strand normalization shares observational quality across strands - only relevant for non-stranded DNA genome assembly
    strand_normalized_quality_support = Dict{kmer_type, Vector{Float64}}()
    for (kmer, support) in all_kmer_quality_support
        # copy so the in-place `.+=` below cannot modify `all_kmer_quality_support`;
        # without it the reverse-complement k-mer would later add an already
        # strand-summed vector and count its own support twice
        strand_normalized_quality_support[kmer] = copy(support)
        # NOTE(review): the reverse-complement vector is added position-by-position
        # without reversing it - confirm whether base positions should be mirrored
        if haskey(all_kmer_quality_support, BioSequences.reverse_complement(kmer))
            strand_normalized_quality_support[kmer] .+= all_kmer_quality_support[BioSequences.reverse_complement(kmer)]
        end
    end
    kmer_mean_quality = sort(Dict(kmer => strand_normalized_quality_support[kmer] ./ canonical_kmer_counts[BioSequences.canonical(kmer)] for kmer in ordered_kmers))
    MetaGraphs.set_prop!(graph, :kmer_mean_quality, kmer_mean_quality)
    kmer_total_quality = sort(Dict(kmer => sum(quality_values) for (kmer, quality_values) in strand_normalized_quality_support))
    MetaGraphs.set_prop!(graph, :kmer_total_quality, kmer_total_quality)
    # hoisted: the grand total is the same for every k-mer
    grand_total_quality = sum(values(kmer_total_quality))
    state_likelihoods = sort(Dict(kmer => total_quality / grand_total_quality for (kmer, total_quality) in kmer_total_quality))
    MetaGraphs.set_prop!(graph, :state_likelihoods, state_likelihoods)


    # all transition likelihood calculation
    transition_likelihoods = SparseArrays.spzeros(total_states, total_states)
    for record in records
        sequence = BioSequences.LongDNA{4}(FASTX.sequence(record))
        # pair every k-mer with the k-mer that starts one base later
        sources = Kmers.EveryKmer{kmer_type}(sequence[1:end-1])
        destinations = Kmers.EveryKmer{kmer_type}(sequence[2:end])
        for ((source_i, source), (destination_i, destination)) in zip(sources, destinations)
            source_index = kmer_indices[source]
            destination_index = kmer_indices[destination]
            transition_likelihoods[source_index, destination_index] += 1
        end
    end
    # turn each row of transition counts into outgoing probabilities
    for source in 1:total_states
        outgoing_transition_counts = transition_likelihoods[source, :]
        if sum(outgoing_transition_counts) > 0
            transition_likelihoods[source, :] .= transition_likelihoods[source, :] ./ sum(transition_likelihoods[source, :])
        end
    end
    row_indices, column_indices, cell_values = SparseArrays.findnz(transition_likelihoods)
    for (row, col, value) in zip(row_indices, column_indices, cell_values)
        Graphs.add_edge!(graph, row, col)
        MetaGraphs.set_prop!(graph, row, col, :transition_likelihood, value)
    end
    MetaGraphs.set_prop!(graph, :transition_likelihoods, transition_likelihoods)

    # helpful for downstream processing
    unbranching_nodes = Set(Int[])
    for node in Graphs.vertices(graph)
        if (Graphs.indegree(graph, node) <= 1) && (Graphs.outdegree(graph, node) <= 1)
            push!(unbranching_nodes, node)
        end
    end
    branching_nodes = Set(setdiff(Graphs.vertices(graph), unbranching_nodes))
    MetaGraphs.set_prop!(graph, :unbranching_nodes, unbranching_nodes)
    MetaGraphs.set_prop!(graph, :branching_nodes, branching_nodes)


    # total_strand_normalized_quality_support = sum.(collect(values(strand_normalized_quality_support)))
    sorted_kmer_total_quality_values = collect(values(kmer_total_quality))
    mean_quality_value = Statistics.mean(sorted_kmer_total_quality_values)
    # k-mers whose total quality exceeds the mean are treated as likely valid
    threshold = mean_quality_value

    # NOTE(review): these are positions within `kmer_total_quality`; downstream code
    # uses them to index `ordered_kmers`, which assumes both share the same ordering
    xs = [
        [i for (i, y) in enumerate(sorted_kmer_total_quality_values) if y > threshold],
        [i for (i, y) in enumerate(sorted_kmer_total_quality_values) if y <= threshold]
        ]

    likely_valid_kmer_indices = xs[1]
    MetaGraphs.set_prop!(graph, :likely_valid_kmer_indices, likely_valid_kmer_indices)
    likely_sequencing_artifact_indices = xs[2]
    MetaGraphs.set_prop!(graph, :likely_sequencing_artifact_indices, likely_sequencing_artifact_indices)
    # likely_sequencing_artifact_kmers = Set(ordered_kmers[likely_sequencing_artifact_indices])
    # likely_valid_kmers = Set(ordered_kmers[likely_valid_kmer_indices])
    # kmer_to_index_map = Dict(kmer => i for (i, kmer) in enumerate(ordered_kmers))


    if plot
        ys = [
            [y for y in sorted_kmer_total_quality_values if y > threshold],
            [y for y in sorted_kmer_total_quality_values if y <= threshold]
        ]

        p = StatsPlots.scatter(
            xs,
            ys,
            title = "kmer qualities",
            ylabel = "canonical kmer cumulative QUAL value",
            label = ["above" "below"],
            legend = :outertopright,
            # size = (900, 500),
            margins=10StatsPlots.Plots.PlotMeasures.mm,
            xticks = false
        )
        p = StatsPlots.hline!(p, [mean_quality_value], label="mean")
        display(p)
    end
    return graph
end

In [None]:
# selected after trialing previous and next ks and finding those to be too unstable
"""
    iterative_polishing(fastq, max_k = 89)

Polish `fastq` repeatedly with increasing k.

The first round lets `polish_fastq` pick k. Each following round polishes the
previous round's output with the smallest entry of `Mycelia.ks()` that is larger
than the k just used. Rounds continue while the last k is below `max_k`, and
stop early when `Mycelia.ks()` offers no larger k.

Returns the vector of per-round results, each as returned by `polish_fastq`.
"""
function iterative_polishing(fastq, max_k = 89)
    # initial polishing
    polishing_results = [polish_fastq(fastq=fastq)]
    while (!ismissing(last(polishing_results).k)) && (last(polishing_results).k < max_k)
        current_k = last(polishing_results).k
        larger_ks = filter(k -> k > current_k, Mycelia.ks())
        # no larger k on offer: stop instead of failing on `first` of an empty collection
        isempty(larger_ks) && break
        next_k = first(larger_ks)
        # @show next_k
        push!(polishing_results, polish_fastq(fastq=last(polishing_results).fastq, k=next_k))
    end
    return polishing_results
end

In [None]:
# polishing_results = iterative_polishing(fastq)

In [None]:
# Mean read length over all records of `assembly_fastq`
# (`assembly_fastq` is assigned in the restart-point cell further down, which must run first).
average_read_length = Statistics.mean(
    map(record -> length(FASTX.sequence(record)), collect(Mycelia.open_fastx(assembly_fastq)))
)

In [None]:
# Largest k offered by Mycelia.ks() that does not exceed half the average read length.
assembly_k = last(filter(<=(average_read_length/2), Mycelia.ks()))

In [None]:
# assembly_k = last(polishing_results).k
# assembly_fastq = last(polishing_results).fastq
# # kmer_graph = build_directed_kmer_graph(fastq=assembly_fastq, k=assembly_k)

# restart point
# Hard-coded absolute path to an already-polished FASTQ; presumably the output of a
# previous iterative_polishing run (the file name carries one .k<N> suffix per round),
# so the notebook can resume here without repeating the polishing - verify the path
# exists on the machine in use.
assembly_fastq = "/global/cfs/cdirs/m4269/cjprybol/Mycelia/projects/variant-calling-benchmarking/data/genomes/fF3EHHs.fna.normalized.vcf.fna.badread.10x.filtlong.k11.k13.k17.k19.k23.k31.k53.k89.fq.gz"
# assembly_k = 89

In [None]:
# Build the assembly k-mer graph from the polished reads using k-mers of size `assembly_k`.
# Later cells read per-vertex :count and :kmer properties from this graph.
kmer_graph = Mycelia.fastx_to_kmer_graph(Kmers.DNAKmer{assembly_k}, [assembly_fastq])

In [None]:
# Tally how many vertices carry each :count value.
StatsBase.countmap(map(v -> kmer_graph.vprops[v][:count], Graphs.vertices(kmer_graph)))

In [None]:
# heuristic - should be based on something better or removed altogether
max_filter = 5

# Vertices of `kmer_graph` whose :count value satisfies the predicate `keep`.
vertices_with_count(keep) = [v for v in Graphs.vertices(kmer_graph) if keep(kmer_graph.vprops[v][:count])]

initial_connected_components = length(Graphs.connected_components(kmer_graph))
coverage_threshold = 1
candidate_subgraph, vertex_map = Graphs.induced_subgraph(kmer_graph, vertices_with_count(>(coverage_threshold)))
# Raise the threshold while dropping vertices at or below it leaves the number of
# connected components unchanged, up to at most `max_filter`.
while (length(Graphs.connected_components(candidate_subgraph)) == initial_connected_components) && (coverage_threshold < max_filter)
    coverage_threshold += 1
    candidate_subgraph, vertex_map = Graphs.induced_subgraph(kmer_graph, vertices_with_count(>(coverage_threshold)))
end
@show coverage_threshold

# The final subgraph keeps vertices whose count is at least the threshold reached above.
candidate_subgraph, vertex_map = Graphs.induced_subgraph(kmer_graph, vertices_with_count(>=(coverage_threshold)))

In [None]:
# Display the per-vertex property dictionaries of the coverage-filtered subgraph.
candidate_subgraph.vprops

In [None]:
# import Graphs

"""
    find_unbranching_walks(undirected_graph)

Partition the vertices of `undirected_graph` that have degree >= 1 into walks,
visiting vertices in increasing order and never reusing a visited vertex.

- An unvisited vertex of degree 1 or degree > 2 becomes a single-vertex walk.
- An unvisited vertex of degree 2 seeds a walk that is extended first forwards
  and then backwards: while the current end has degree 2 and an unvisited
  neighbour, the first unvisited neighbour is added. After each direction the
  vertex where that extension stopped is added to the walk once more, so both
  ends of such a walk appear twice.
- Vertices of degree 0 are not reported.

NOTE(review): the duplicated end vertices and the single-vertex walks for
terminal/branch vertices look unintended, but callers may depend on this exact
shape, so it is preserved here.

Returns a `Vector{Vector{Int}}` of walks.
"""
function find_unbranching_walks(undirected_graph)
    walks = Vector{Vector{Int}}()
    seen = falses(Graphs.nv(undirected_graph))

    # Grow `walk` from `origin` with `grow!` (push! or pushfirst!) and return the
    # vertex at which the extension stopped.
    function extend_walk!(walk, origin, grow!)
        cursor = origin
        while Graphs.degree(undirected_graph, cursor) == 2
            nbrs = Graphs.neighbors(undirected_graph, cursor)
            pick = findfirst(nbr -> !seen[nbr], nbrs)
            isnothing(pick) && break
            cursor = nbrs[pick]
            grow!(walk, cursor)
            seen[cursor] = true
        end
        return cursor
    end

    for v in Graphs.vertices(undirected_graph)
        seen[v] && continue
        deg = Graphs.degree(undirected_graph, v)
        if deg == 1 || deg > 2
            # terminal vertex or branch point: extension needs a degree-2 start,
            # so the walk is just this vertex
            seen[v] = true
            push!(walks, [v])
        elseif deg == 2
            walk = [v]
            seen[v] = true

            forward_stop = extend_walk!(walk, v, push!)
            push!(walk, forward_stop)

            backward_stop = extend_walk!(walk, v, pushfirst!)
            pushfirst!(walk, backward_stop)

            push!(walks, walk)
        end
    end
    return walks
end

In [None]:
# quirk: every walk is stripped of its first and last entry
# (a single-vertex walk therefore becomes empty)
unbranching_walks = map(walk -> walk[2:end-1], find_unbranching_walks(candidate_subgraph))

In [None]:
# kmer_slice(kmer, slice) = [kmer[i] for i in slice]

# function reorient_kmers(kmers)
#     oriented_kmers = []
    
#     for i in 1:length(kmers)-1
#         kmer1 = kmers[i]
#         kmer2 = kmers[i+1]
        
#         if all(kmer_slice(kmer1, 2:k) .== kmer_slice(kmer2, 1:k-1)
#             push!(oriented_kmers, kmer1)
#         else
#             push!(oriented_kmers, BioSequences.reverse_complement(kmer1))
#         end
#     end
    
#     last_kmer = kmers[end]
#     if startswith(last_kmer, oriented_kmers[end][2:end])
#         push!(oriented_kmers, last_kmer)
#     else
#         push!(oriented_kmers, BioSequences.reverse_complement(last_kmer))
#     end
    
#     return oriented_kmers
# end

In [None]:
# K-mers stored on the vertices of the first unbranching walk, in walk order.
walk_kmers = map(v -> candidate_subgraph.vprops[v][:kmer], unbranching_walks[1])

In [None]:
# Seed the walk sequence with the first k-mer of the walk.
walk_sequence = BioSequences.LongDNA{2}(first(walk_kmers))

In [None]:
# Short alias for the k-mer size, used by the overlap check in the next cell.
k = assembly_k

In [None]:
# Check whether the second k-mer of the walk extends `walk_sequence` in its stored
# orientation ("match") or as its reverse complement ("alt_match").
# `2:min(2, end)` yields an empty slice instead of an error for a one-k-mer walk.
for walk_kmer in walk_kmers[2:min(2, end)]
    walk_kmer_sequence = BioSequences.LongDNA{2}(walk_kmer)
    # adjacent k-mers overlap by k-1 bases, so compare the last k-1 bases of the
    # walk with the first k-1 bases of the candidate k-mer
    walk_suffix = walk_sequence[end-k+2:end]
    if walk_suffix == walk_kmer_sequence[1:end-1]
        @show "match"
    elseif walk_suffix == BioSequences.reverse_complement(walk_kmer_sequence)[1:end-1]
        @show "alt_match"
    end
end

In [None]:
# `reorient_kmers` currently exists only as commented-out code in an earlier cell,
# so only call it when a definition has actually been evaluated.
@isdefined(reorient_kmers) && reorient_kmers([])

In [None]:
# vertex_map

In [None]:
# StatsBase.countmap([Graphs.degree(candidate_subgraph, v) for v in Graphs.vertices(candidate_subgraph)])

In [None]:
# sorted_unbranching_walks = sort(find_unbranching_walks(candidate_subgraph), by=x->length(x), rev=true)

In [None]:
# length.(sorted_unbranching_walks)

In [None]:
# candidate_subgraph

In [None]:
# StatsBase.countmap([Graphs.degree(kmer_graph, v) for v in Graphs.vertices(kmer_graph)])

In [None]:
# ends = [v for v in Graphs.vertices(candidate_subgraph) if Graphs.degree(candidate_subgraph, v) == 1]

In [None]:
# no branch points!
# StatsBase.countmap(map(v -> Graphs.degree(candidate_subgraph, v), Graphs.vertices(candidate_subgraph)))

In [None]:
# candidate_subgraph

In [None]:
# solve later
# branch_points = [v for v in Graphs.vertices(candidate_subgraph) if Graphs.degree(candidate_subgraph, v) >= 3]
# branch_point_shortest_paths = Array{Vector{Int}}(undef, length(branch_points), length(branch_points))

In [None]:
# Graphs.enumerate_paths(Graphs.dijkstra_shortest_paths(kmer_graph, ends[1]), ends[2])

In [None]:
# function point_to_point_shortest_path(graph, source, source)
#     Graphs.enumerate_paths(Graphs.dijkstra_shortest_paths(kmer_graph, end_1), end_2)
# end

In [None]:
# graphs

In [None]:
# dijkstras_end_1, ends[2])

In [None]:
# candidate_subgraph

In [None]:
# yen_k_shortest_paths(g, source, target,  distmx=weights(g), K=1; maxdist=typemax(T))

In [None]:
# dijkstras_end_2 = Graphs.dijkstra_shortest_paths(kmer_graph, ends[2])

In [None]:
# dijkstras_end_3 = Graphs.dijkstra_shortest_paths(kmer_graph, ends[3])

In [None]:
# findmax(dijkstras_end_1.dists)

In [None]:
# findmax(dijkstras_end_2.dists)

In [None]:
# Graphs.enumerate_paths(dijkstras_end_1, ends[2])
# StatsBase.countmap([kmer_graph.vprops[v][:count] for v in Graphs.enumerate_paths(dijkstras_end_1, ends[2])])
# [kmer_graph.vprops[v][:kmer] for v in Graphs.enumerate_paths(dijkstras_end_1, ends[2])]

In [None]:
# kmer_graph.gprops

In [None]:
# ?Mycelia.path_to_sequence

In [None]:
# Graphs.topological_sort_by_dfs(candidate_subgraph)

In [None]:
# connected_components = Graphs.connected_components(kmer_graph)

In [None]:
# minimum_component_value, minimum_component_index = findmin(sum.(Graphs.connected_components(kmer_graph)))

In [None]:
# topological_ordering = Graphs.topological_sort(kmer_graph)
# Graphs.topological_sort_by_dfs(kmer_graph)

In [None]:
# connected_component = first(connected_components)

In [None]:
# for e in Graphs.edges(kmer_graph)
#     # @show e
#     # @show MetaGraphs.props(kmer_graph, e)
#     transition_likelihood = MetaGraphs.get_prop(kmer_graph, e, :transition_likelihood)
#     MetaGraphs.set_prop!(kmer_graph, e, :weight, 1/transition_likelihood)
# end

In [None]:
# size_sorted_components = sort(Graphs.connected_components(kmer_graph), by=x->length(x), rev=true)

In [None]:
# larger_component = size_sorted_components[1]
# smaller_component = size_sorted_components[2]
# smaller_component_kmers = MetaGraphs.get_prop(kmer_graph, :ordered_kmers)[smaller_component]


In [None]:
# count(ismissing, [get(MetaGraphs.get_prop(kmer_graph, :kmer_indices), BioSequences.reverse_complement(kmer), missing) for kmer in smaller_component_kmers])

In [None]:
# 

In [None]:
# induced_subraphs_and_node_maps = [Graphs.induced_subgraph(kmer_graph, connected_component) for connected_component in Graphs.connected_components(kmer_graph)]
# largest components first
# sort!(induced_subraphs_and_node_maps, by=x->Graphs.nv(x[1]), rev=true)

In [None]:
# (induced_subgraph, node_map) = first(induced_subraphs_and_node_maps)

In [None]:
# sorted_subgraph = Graphs.topological_sort_by_dfs(induced_subgraph)

In [None]:
# source = first(sorted_subgraph)

In [None]:

# 
# sink = last(sorted_subgraph)

In [None]:
# # Find the shortest path using Dijkstra's algorithm
# shortest_path = Graphs.dijkstra_shortest_paths(induced_subgraph, source)

In [None]:
# Extract the maximum likelihood path
# max_likelihood_path = Graphs.enumerate_paths(Graphs.dijkstra_shortest_paths(induced_subgraph, source), sink)

In [None]:


# # Extract the maximum likelihood path
# max_likelihood_path = enumerate_paths(dijkstra_shortest_paths(g, source), sink)

In [None]:
# Find the shortest path using Dijkstra's algorithm
# shortest_path = dijkstra_shortest_paths(g, source, sink)

# Extract the maximum likelihood path
# max_likelihood_path = enumerate_paths(dijkstra_shortest_paths(g, source), sink)

In [None]:
# only works for undirected graphs
# Graphs.is_eulerian(kmer_graph)
# Graphs.is_cyclic(kmer_graph)

In [None]:
# Graphs.saw(kmer_graph)

In [None]:
# component_subgraph, component_subgraph_vertex_map = Graphs.induced_subgraph()

In [None]:
# Graphs.topological_sort(kmer_graph)

In [None]:
# Graphs.topological_sort_by_dfs(kmer_graph)

In [None]:
# Graphs.eulerian(kmer_graph)

In [None]:
# Graphs.is_cyclic(kmer_graph)

In [None]:
# MetaGraphs.get_prop(kmer_graph, :transition_likelihoods)

In [None]:
# transition_likelihoods = MetaGraphs.get_prop(kmer_graph, :transition_likelihoods)
# edge_distances = map(x -> x == 0 ? zero(x) : 1/x, transition_likelihoods)

In [None]:
# Graphs.dag_longest_path(kmer_graph, transition_likelihoods)
# Graphs.saw(kmer_graph)
# randomwalk(g, s, niter; seed=-1)
# non_backtracking_randomwalk(g, s, niter; seed=-1)
# self_avoiding_walk(g, s, niter; seed=-1)
# dag_longest_path(g, distmx=weights(g); topological_order=topological_sort_by_dfs(g))

In [None]:
# kmer_graph.gprops

In [None]:
# # heuristic - should be based on something better or removed altogether
# max_filter = 5

# initial_connected_components = length(Graphs.connected_components(kmer_graph))
# coverage_threshold = 1
# candidate_subgraph, vertex_map = Graphs.induced_subgraph(kmer_graph, [v for v in Graphs.vertices(kmer_graph) if kmer_graph.vprops[v][:count] > coverage_threshold])
# while (length(Graphs.connected_components(candidate_subgraph)) == initial_connected_components) && (coverage_threshold < max_filter)
#     coverage_threshold += 1
#     candidate_subgraph, vertex_map = Graphs.induced_subgraph(kmer_graph, [v for v in Graphs.vertices(kmer_graph) if kmer_graph.vprops[v][:count] > coverage_threshold])
# end
# @show coverage_threshold


# candidate_subgraph, vertex_map = Graphs.induced_subgraph(kmer_graph, [v for v in Graphs.vertices(kmer_graph) if kmer_graph.vprops[v][:count] >= coverage_threshold])

In [None]:
# component_subgraphs_and_vertex_maps = [Graphs.induced_subgraph(candidate_subgraph, connected_component) for connected_component in Graphs.connected_components(candidate_subgraph)]

In [None]:
# component_subgraph, component_subgraph_vertex_map = first(component_subgraphs_and_vertex_maps)

In [None]:
# Graphs.topological_sort(component_subgraph)

In [None]:
# Graphs.topological_sort_by_dfs(component_subgraph)

In [None]:
# Graphs.eulerian(component_subgraph)

In [None]:
# Graphs.is_cyclic(component_subgraph)

In [None]:
# Graphs.saw(component_subgraph)

In [None]:
# component_subgraph

In [None]:
# component_subgraph_vertex_map

In [None]:
# write out final assembly

In [None]:
# call variants

In [None]:
# assess accuracy