In [1]:
# Name the working directory after the run date and task, then create it
# (including any missing parent directories) under the user's home directory.
# `mkpath` returns the path it created, so `DIR` ends up holding the full path.
DATE = "2021-06-20"
TASK = "100bp-10x-coverage-0.01-error-rate-full"
DIR = mkpath("$(homedir())/$(DATE)-$(TASK)")

"/Users/cameronprybol/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full"

In [2]:
pkgs = [
"LightGraphs",
"MetaGraphs",
"BioSequences",
"uCSV",
"DataFrames",
"FASTX",
"HTTP",
"CodecZlib",
"DataStructures",
"Revise",
"ProgressMeter",
"BenchmarkTools",
"Random",
"StatsBase",
"SparseArrays",
"Statistics",
"BioAlignments",
"NumericIO",
"Primes"
]

import Pkg
Pkg.add(pkgs)
for pkg in pkgs
    eval(Meta.parse("import $(basename(pkg))"))
end

import Mycelia

In [3]:
"""
    q_value_to_error_rate(q_value)

Convert a Phred-style quality score into an error probability,
`10^(q_value / -10)`.

The division is performed before any negation so that unsigned inputs
(e.g. `UInt8` quality bytes) are promoted rather than wrapped around.
"""
q_value_to_error_rate(q_value) = 10^(q_value / (-10))

"""
    error_rate_to_q_value(error_rate)

Convert an error probability into a Phred-style quality score,
`-10 * log10(error_rate)`.
"""
error_rate_to_q_value(error_rate) = -10 * log10(error_rate)

error_rate_to_q_value (generic function with 1 method)

In [4]:
"""
    initialize_transition_probabilities(kmer_graph)

Build the vertex-to-vertex transition probabilities of `kmer_graph`.

Returns a `Dict` with one sparse `nv × nv` matrix per source orientation
(`true` and `false`). Entry `[src, dst]` starts as the number of `:evidence`
items on the edge `src → dst` (recorded under every source orientation listed
in the edge's `:orientations`), and each row is then divided by its row sum.
Every row therefore sums to either 0 (no outgoing edges in that orientation)
or 1, which is asserted.
"""
function initialize_transition_probabilities(kmer_graph)
    total_kmers = LightGraphs.nv(kmer_graph)
    transition_likelihoods = Dict(
        true => SparseArrays.spzeros(total_kmers, total_kmers),
        false => SparseArrays.spzeros(total_kmers, total_kmers)
    )

    # raw weights: the amount of evidence supporting each edge
    for edge in LightGraphs.edges(kmer_graph)
        edge_properties = kmer_graph.eprops[edge]
        evidence_count = length(edge_properties[:evidence])
        for edge_orientation in edge_properties[:orientations]
            weights = transition_likelihoods[edge_orientation.source_orientation]
            weights[edge.src, edge.dst] = evidence_count
        end
    end

    # normalize every row so the outgoing weights become probabilities
    for weights in (transition_likelihoods[true], transition_likelihoods[false])
        for src in 1:total_kmers
            outgoing = weights[src, :]
            row_total = sum(outgoing)
            for (dst, weight) in zip(SparseArrays.findnz(outgoing)...)
                weights[src, dst] = weight / row_total
            end
            normalized_probability = sum(weights[src, :])
            @assert isapprox(normalized_probability, 0) || isapprox(normalized_probability, 1)
        end
    end
    return transition_likelihoods
end

initialize_transition_probabilities (generic function with 1 method)

In [5]:
"""
    set_initial_state_likelihoods!(kmer_graph, initial_state, kmer_likelihoods,
                                   error_rate, state_likelihoods, arrival_paths)

Fill the first column of `state_likelihoods` and `arrival_paths` in place.

For every vertex of `kmer_graph`, the vertex's kmer is aligned (Levenshtein
distance) against both `initial_state.fw` and `initial_state.bw`. Each
alignment is scored as the vertex's prior `kmer_likelihoods[vertex]` times
`(1 - error_rate)` per matched position and `error_rate` per edit. The larger
of the two scores is stored in `state_likelihoods[vertex, 1]`, and
`arrival_paths[vertex, 1]` becomes `[vertex => orientation]`, where the
orientation is `true` (forward wins), `false` (backward wins), or `missing`
(the two scores are not strictly ordered).
"""
function set_initial_state_likelihoods!(
        kmer_graph,
        initial_state,
        kmer_likelihoods,
        error_rate,
        state_likelihoods,
        arrival_paths
    )
    # likelihood of observing `observed_kmer` when the true kmer is `hidden_kmer`
    function observation_likelihood(observed_kmer, hidden_kmer, prior)
        alignment_result =
            BioAlignments.pairalign(
                BioAlignments.LevenshteinDistance(),
                observed_kmer,
                hidden_kmer)
        likelihood = prior
        for _ in 1:BioAlignments.count_matches(BioAlignments.alignment(alignment_result))
            likelihood *= 1 - error_rate
        end
        for _ in 1:alignment_result.value
            likelihood *= error_rate
        end
        return likelihood
    end

    for vertex in LightGraphs.vertices(kmer_graph)
        hidden_kmer = kmer_graph.vprops[vertex][:kmer]
        prior = kmer_likelihoods[vertex]
        fw_probability = observation_likelihood(initial_state.fw, hidden_kmer, prior)
        bw_probability = observation_likelihood(initial_state.bw, hidden_kmer, prior)

        state_orientation =
            if fw_probability > bw_probability
                true
            elseif fw_probability < bw_probability
                false
            else
                missing
            end
        # the backward score is only kept when it is strictly larger
        state_probability = fw_probability < bw_probability ? bw_probability : fw_probability

        state_likelihoods[vertex, 1] = state_probability
        arrival_paths[vertex, 1] = [vertex => state_orientation]
    end
    return nothing
end

set_initial_state_likelihoods! (generic function with 1 method)

In [6]:
"""
    oriented_path_to_sequence(kmer_graph, oriented_path)

Spell out the DNA sequence described by `oriented_path`, a collection of
`vertex => orientation` pairs.

The first kmer is taken whole (reverse-complemented when its orientation is
`false`); every following kmer contributes only its last base. Before a base
is appended, the `k - 1` base suffix of the sequence built so far is compared
with the `k - 1` base prefix of the incoming kmer. A mismatch is logged as an
error but does not stop the reconstruction.

NOTE(review): orientations are used directly in boolean context, so a
`missing` orientation in `oriented_path` raises a `TypeError` here — confirm
whether callers can pass unresolved orientations.
"""
function oriented_path_to_sequence(kmer_graph, oriented_path)
    k = kmer_graph.gprops[:k]
    initial_kmer, initial_orientation = first(oriented_path)
    sequence = BioSequences.LongDNASeq(kmer_graph.vprops[initial_kmer][:kmer])
    if !initial_orientation
        sequence = BioSequences.reverse_complement(sequence)
    end
    for (kmer, orientation) in Iterators.drop(oriented_path, 1)
        kmer_seq = BioSequences.LongDNASeq(kmer_graph.vprops[kmer][:kmer])
        if !orientation
            kmer_seq = BioSequences.reverse_complement(kmer_seq)
        end
        sequence_suffix = sequence[end-k+2:end]
        kmer_prefix = kmer_seq[1:end-1]
        if sequence_suffix != kmer_prefix
            # keep going (as before) but say what went wrong and where
            @error "oriented path is not a walk: kmer does not overlap the sequence built so far by k-1 bases" kmer orientation sequence_suffix kmer_prefix
        end
        push!(sequence, last(kmer_seq))
    end
    return sequence
end

oriented_path_to_sequence (generic function with 1 method)

In [7]:
"""
    sequence_to_oriented_path(sequence, kmers::Vector{T})

Translate `sequence` into a vector of `index => orientation` pairs, one per
kmer of type `T` yielded by `BioSequences.each(T, sequence)`.

`index` is the result of looking up the kmer's canonical form in `kmers` with
`Mycelia.get_kmer_index`, and `orientation` is `true` when the forward kmer is
already the canonical one. The result is preallocated with
`length(sequence) - K + 1` slots.
"""
function sequence_to_oriented_path(sequence, kmers::Vector{T}) where {T <: BioSequences.AbstractMer{A, K}} where {A, K}
    expected_kmers = length(sequence) - K + 1
    observed_path = Vector{Pair{Int, Bool}}(undef, expected_kmers)
    position = 0
    for observed in BioSequences.each(T, sequence)
        position += 1
        forward_kmer = observed.fw
        canonical_kmer = BioSequences.canonical(forward_kmer)
        vertex_index = Mycelia.get_kmer_index(kmers, canonical_kmer)
        observed_path[position] = Pair(vertex_index, forward_kmer == canonical_kmer)
    end
    return observed_path
end

sequence_to_oriented_path (generic function with 1 method)

In [8]:
# the way this is currently implemented, it can't handle kmers not present in the graph
"""
    oriented_path_to_likelihood(kmer_graph, kmers, kmer_likelihoods,
                                transition_likelihoods, fastq_record)

Score the read in `fastq_record` exactly as observed (without correction).

The score starts at the likelihood of the read's first kmer multiplied by the
accuracy (`1 - error rate`) of each of its first `k` quality scores. Every
following kmer multiplies in its own likelihood, the transition likelihood
from the preceding kmer (looked up under the preceding kmer's orientation),
and the accuracy of the one new base it adds.

The quality scores are decoded once up front rather than on every loop
iteration.
"""
function oriented_path_to_likelihood(kmer_graph, kmers, kmer_likelihoods, transition_likelihoods, fastq_record)
    k = kmer_graph.gprops[:k]
    quality_scores = FASTX.quality(fastq_record)
    original_oriented_path = sequence_to_oriented_path(FASTX.sequence(fastq_record), kmers)
    path_likelihood = kmer_likelihoods[first(first(original_oriented_path))]

    # the first kmer covers the first k bases of the read
    for q_value in quality_scores[1:k]
        accuracy = 1 - q_value_to_error_rate(q_value)
        path_likelihood *= accuracy
    end

    for index in 2:length(original_oriented_path)
        kmer, _ = original_oriented_path[index]
        prior_kmer, prior_orientation = original_oriented_path[index-1]
        state_likelihood = kmer_likelihoods[kmer]
        transition_likelihood = transition_likelihoods[prior_orientation][prior_kmer, kmer]
        # the kmer at path position `index` ends at this base of the read
        sequence_index = k + index - 1
        emission_likelihood = 1 - q_value_to_error_rate(quality_scores[sequence_index])
        path_likelihood *= state_likelihood * transition_likelihood * emission_likelihood
    end
    return path_likelihood
end

oriented_path_to_likelihood (generic function with 1 method)

In [31]:
# Count the records in `fastq_file`, closing the reader afterwards.
function _count_fastq_records(fastq_file)
    fastq_reader = FASTX.FASTQ.Reader(open(fastq_file))
    try
        number_of_records = 0
        for _ in fastq_reader
            number_of_records += 1
        end
        return number_of_records
    finally
        close(fastq_reader)
    end
end

# Run the Viterbi-style correction for a single read and return the polished FASTQ record.
function _polish_fastq_record(
        kmer_graph,
        fastq_record,
        k,
        kmers,
        kmer_type,
        kmer_likelihoods,
        transition_likelihoods,
        shortest_paths,
        non_singleton_states
    )
    observed_sequence = FASTX.sequence(fastq_record)
    observed_quality = FASTX.quality(fastq_record)

    total_kmers = length(kmers)
    total_states = length(observed_sequence) - k + 1
    state_likelihoods = zeros(total_kmers, total_states)
    # one independent (initially empty) path per cell, so cells never alias each other
    arrival_paths = [Pair{Int, Union{Bool, Missing}}[] for _ in 1:total_kmers, _ in 1:total_states]

    # likelihoods of the initial states
    initial_state = first(BioSequences.each(kmer_type, observed_sequence))
    # note this is a place for potential improvement, use the q value at each base
    # to guide probability rather than the median
    median_q_value = Statistics.median(Int.(observed_quality[1:k]))
    current_error_rate = q_value_to_error_rate(median_q_value)
    set_initial_state_likelihoods!(
        kmer_graph,
        initial_state,
        kmer_likelihoods,
        current_error_rate,
        state_likelihoods,
        arrival_paths
    )

    # likelihoods of the downstream states
    ProgressMeter.@showprogress for current_state in 2:total_states
        prior_state = current_state - 1

        # each new state adds exactly one base of the read
        sequence_index = k - 1 + current_state
        observed_nucleotide = observed_sequence[sequence_index]
        observed_quality_score = observed_quality[sequence_index]
        observed_error_rate = q_value_to_error_rate(observed_quality_score)

        # assess prior states in order of decreasing likelihood so that run_viterbi!
        # can bail out early as often as possible, and skip all prior states with
        # zero probability; the prior column is not modified while this state is
        # being filled in, so the candidate list can be built once
        prior_likelihoods = state_likelihoods[:, prior_state]
        candidate_prior_vertices = [
            vertex for vertex in sortperm(prior_likelihoods, rev=true)
            if prior_likelihoods[vertex] > 0
        ]

        for current_vertex in non_singleton_states
            for prior_vertex in candidate_prior_vertices
                run_viterbi!(
                    current_state,
                    prior_state,
                    observed_nucleotide,
                    observed_quality_score,
                    observed_error_rate,
                    current_vertex,
                    prior_vertex,
                    state_likelihoods,
                    transition_likelihoods,
                    shortest_paths,
                    arrival_paths,
                    kmer_graph,
                    kmer_likelihoods
                )
            end
        end
    end

    maximum_likelihood_path, maximum_likelihood_value =
        determine_maximum_likelihood_path(
            state_likelihoods,
            arrival_paths
        )

    sequence = oriented_path_to_sequence(kmer_graph, maximum_likelihood_path)

    # compare against the likelihood of the read as originally observed
    original_sequence_likelihood = oriented_path_to_likelihood(kmer_graph, kmers, kmer_likelihoods, transition_likelihoods, fastq_record)
    relative_likelihood = maximum_likelihood_value / original_sequence_likelihood

    identifier = FASTX.identifier(fastq_record) * "_k$(k)"
    description = string(relative_likelihood)
    # because the sequences won't always be the same length, we take an ordered sampling with replacement
    # which introduces some random error but preserves overall patterns and areas of high/low accuracy
    quality_scores = StatsBase.sample(observed_quality, length(sequence), ordered=true)

    return FASTX.FASTQ.Record(
        identifier,
        description,
        sequence,
        quality_scores
    )
end

"""
    polish_fastq(kmer_graph, fastq_file)

Error-correct every read of `fastq_file` against `kmer_graph` and write the
corrected reads to a new FASTQ file, whose path is returned.

The output path is `fastq_file` with `".fastq"` replaced by `".k<k>.fastq"`,
where `k` is read from `kmer_graph.gprops[:k]`. Each output record has
`"_k<k>"` appended to its identifier and carries, as its description, the
ratio of the corrected path's likelihood to the original read's likelihood.

Kmer likelihoods are the per-vertex `:evidence` counts divided by their sum.
Only vertices with more than one piece of evidence are considered as
destination states.

Everything that depends only on the graph (shortest paths, transition
probabilities, the non-singleton vertex list) is computed once. The readers
and the writer are closed even when an error interrupts processing. The
initial-state error rate is derived from the median quality of each read's
first `k` bases.
"""
function polish_fastq(kmer_graph, fastq_file)
    k = kmer_graph.gprops[:k]

    # kmer likelihoods
    kmers = [kmer_graph.vprops[v][:kmer] for v in LightGraphs.vertices(kmer_graph)]
    kmer_counts = [length(kmer_graph.vprops[v][:evidence]) for v in LightGraphs.vertices(kmer_graph)]
    kmer_likelihoods = kmer_counts ./ sum(kmer_counts)
    kmer_type = BioSequences.BigDNAMer{k}

    # shortest paths between all pairs of kmers
    shortest_paths = LightGraphs.enumerate_paths(LightGraphs.floyd_warshall_shortest_paths(kmer_graph))

    # these depend only on the graph, so they are shared by every read
    transition_likelihoods = initialize_transition_probabilities(kmer_graph)
    non_singleton_states = findall(kmer_counts .> 1)

    # count the records up front to establish a runtime estimate
    progress_bar = ProgressMeter.Progress(_count_fastq_records(fastq_file), 1)

    output_fastq_file = replace(fastq_file, ".fastq" => ".k$(k).fastq")
    fastq_reader = FASTX.FASTQ.Reader(open(fastq_file))
    try
        fastq_writer = FASTX.FASTQ.Writer(open(output_fastq_file, "w"))
        try
            for fastq_record in fastq_reader
                ProgressMeter.next!(progress_bar)
                new_fastq_record = _polish_fastq_record(
                    kmer_graph,
                    fastq_record,
                    k,
                    kmers,
                    kmer_type,
                    kmer_likelihoods,
                    transition_likelihoods,
                    shortest_paths,
                    non_singleton_states
                )
                write(fastq_writer, new_fastq_record)
            end
        finally
            close(fastq_writer)
        end
    finally
        close(fastq_reader)
    end
    return output_fastq_file
end

polish_fastq (generic function with 1 method)

In [10]:
"""
    determine_maximum_likelihood_path(state_likelihoods, arrival_paths)

Trace back the most likely path through the state matrix.

The highest value in the last column of `state_likelihoods` selects the final
vertex (ties are broken uniformly at random). Starting from that cell's entry
in `arrival_paths`, the path is extended backwards one state column at a time:
the first element of the path built so far names the vertex to look up in the
preceding column, and that cell's arrival path — minus its last element, which
duplicates the current head — is prepended.

Raises an error when a preceding arrival path does not end on the current
head of the path (a comparison that evaluates to `missing` is accepted).

Returns `(maximum_likelihood_path, maximum_likelihood_value)`.
"""
function determine_maximum_likelihood_path(
    state_likelihoods,
    arrival_paths
    )
    final_likelihoods = state_likelihoods[:, end]
    maximum_likelihood_value = maximum(final_likelihoods)

    # if multiple paths are tied, randomly choose one
    tied_vertices = findall(final_likelihoods .== maximum_likelihood_value)
    final_vertex = rand(tied_vertices)

    maximum_likelihood_path = arrival_paths[final_vertex, end]

    total_states = size(state_likelihoods, 2)
    for state_index in reverse(1:total_states-1)
        head_kmer, head_orientation = first(maximum_likelihood_path)
        preceding_path = arrival_paths[head_kmer, state_index]

        # the preceding path must end where the current path begins
        is_match = last(preceding_path) == (head_kmer => head_orientation)
        (ismissing(is_match) || is_match) || error("breaking")

        maximum_likelihood_path = [preceding_path[1:end-1]; maximum_likelihood_path]
    end
    return maximum_likelihood_path, maximum_likelihood_value
end


determine_maximum_likelihood_path (generic function with 1 method)

In [11]:
"""
    run_viterbi!(current_state, prior_state, observed_nucleotide,
                 observed_quality_score, observed_error_rate, current_vertex,
                 prior_vertex, state_likelihoods, transition_likelihoods,
                 shortest_paths, arrival_paths, kmer_graph, kmer_likelihoods)

Evaluate one candidate step of the Viterbi recursion: arriving at
`current_vertex` in column `current_state` from `prior_vertex` in column
`prior_state`. If the candidate is more likely than what is already stored,
`state_likelihoods[current_vertex, current_state]` and
`arrival_paths[current_vertex, current_state]` are overwritten in place.
Always returns `nothing`.

Two kinds of step are considered:

- **Same vertex** (`current_vertex == prior_vertex`): treated as an insertion
  in the observed sequence. The candidate likelihood is the prior state's
  likelihood times `observed_error_rate` twice (emission and transition) times
  the vertex's `kmer_likelihoods` entry, and the candidate path is the single
  last element of the prior arrival path.
- **Different vertex**: the precomputed `shortest_paths[prior_vertex][current_vertex]`
  is walked edge by edge, multiplying in the transition likelihood and kmer
  likelihood of every vertex entered, and finally an emission term depending
  on whether `observed_nucleotide` matches the last base of the terminal kmer
  in its orientation. An orientation left `missing` is resolved when exactly
  one of the two orientations matches the observed base.

The step is abandoned early when the prior state is already less likely than
the stored value, when no path exists, or when no edge orientation is
compatible with the orientation the walk arrives in.

`observed_quality_score` is accepted for interface compatibility but not used.
"""
function run_viterbi!(
        current_state,
        prior_state,
        observed_nucleotide,
        observed_quality_score,
        observed_error_rate,
        current_vertex,
        prior_vertex,
        state_likelihoods,
        transition_likelihoods,
        shortest_paths,
        arrival_paths,
        kmer_graph,
        kmer_likelihoods
        )
    current_state_likelihood = state_likelihoods[current_vertex, current_state]
    prior_state_likelihood = state_likelihoods[prior_vertex, prior_state]

    # if we already have a better possible path, skip calculating anything
    if prior_state_likelihood < current_state_likelihood
        return
    end

    # take shortest path and assume it's the maximum likelihood path
    # this assumption seems fair because in an ideal situation
    # we're just moving to an adjacent kmer
    # and the shortest path and most likely path should be the same
    shortest_path = shortest_paths[prior_vertex][current_vertex]
    is_same_vertex = (current_vertex == prior_vertex)

    # no path & not considering insertion
    if isempty(shortest_path) && !is_same_vertex
        return
    end

    # if the first edge of the shortest path can't be left in the orientation
    # we arrived at the prior vertex in, the path isn't viable
    if !isempty(shortest_path)
        terminal_orientation_prior_state = last(last(arrival_paths[prior_vertex, prior_state]))
        candidate_edge = LightGraphs.Edge(shortest_path[1], shortest_path[2])

        if !ismissing(terminal_orientation_prior_state) &&
            !any(o -> o.source_orientation == terminal_orientation_prior_state, kmer_graph.eprops[candidate_edge][:orientations])
            return
        end
    end

    if is_same_vertex
        # zero step path - insertion in observed sequence relative to kmer graph
        emission_likelihood = observed_error_rate
        transition_likelihood = observed_error_rate
        state_likelihood = kmer_likelihoods[current_vertex]
        path_likelihood = prior_state_likelihood * emission_likelihood * transition_likelihood * state_likelihood
        path = [last(arrival_paths[prior_vertex, prior_state])]

        # compare the candidate (not the stored value against itself) with the stored value
        if path_likelihood > state_likelihoods[current_vertex, current_state]
            state_likelihoods[current_vertex, current_state] = path_likelihood
            arrival_paths[current_vertex, current_state] = path
        end
    elseif !isempty(shortest_path)
        # one or more step path - match, mismatch, or deletion in observed sequence relative to kmer graph
        initial_path_state = last(arrival_paths[prior_vertex, prior_state])

        path = Vector{typeof(initial_path_state)}(undef, length(shortest_path))
        path[1] = initial_path_state

        path_likelihood = state_likelihoods[prior_vertex, prior_state]

        for i in 2:length(shortest_path)
            this_vertex = shortest_path[i]
            # local names so the `prior_vertex` argument is not overwritten
            source_vertex, source_orientation = path[i-1]
            edge = LightGraphs.Edge(source_vertex, this_vertex)

            possible_edge_orientations::Set{NamedTuple{(:source_orientation, :destination_orientation), Tuple{Bool, Bool}}} = kmer_graph.eprops[edge][:orientations]

            if !ismissing(source_orientation)
                possible_edge_orientations = filter(o -> o.source_orientation == source_orientation, possible_edge_orientations)
            end

            # no way to traverse this edge in the orientation we are in: the
            # candidate has zero likelihood and can never beat the stored value
            if isempty(possible_edge_orientations)
                return
            end

            if ismissing(source_orientation)
                # orientation unknown: use the more likely of the two transitions
                # (the forward value is kept unless the backward one is strictly larger)
                forward_transition = transition_likelihoods[true][source_vertex, this_vertex]
                backward_transition = transition_likelihoods[false][source_vertex, this_vertex]
                transition_likelihood = forward_transition < backward_transition ? backward_transition : forward_transition
            else
                transition_likelihood = transition_likelihoods[source_orientation][source_vertex, this_vertex]
            end
            state_likelihood = kmer_likelihoods[this_vertex]
            path_likelihood *= transition_likelihood * state_likelihood

            if length(possible_edge_orientations) == 1
                orientation = first(possible_edge_orientations).destination_orientation
                path[i] = this_vertex => orientation
            else
                path[i] = this_vertex => missing
            end
        end

        # see if new nucleotide is a match or mismatch to terminal kmer in path
        if !isempty(path) && path_likelihood > 0
            terminal_kmer_index, terminal_kmer_orientation = last(path)
            terminal_kmer = BioSequences.LongDNASeq(kmer_graph.vprops[terminal_kmer_index][:kmer])
            fw_is_match = observed_nucleotide == last(terminal_kmer)
            bw_is_match = observed_nucleotide == last(BioSequences.reverse_complement(terminal_kmer))

            if ismissing(terminal_kmer_orientation)
                # exactly one orientation explains the observed base: commit to it
                if fw_is_match != bw_is_match
                    path[end] = terminal_kmer_index => fw_is_match
                end
                is_match = fw_is_match || bw_is_match
            elseif terminal_kmer_orientation
                is_match = fw_is_match
            else
                is_match = bw_is_match
            end

            if is_match
                path_likelihood *= 1 - observed_error_rate
            else
                path_likelihood *= observed_error_rate
            end
        end

        if path_likelihood > state_likelihoods[current_vertex, current_state]
            state_likelihoods[current_vertex, current_state] = path_likelihood
            arrival_paths[current_vertex, current_state] = path
        end
    end
    return
end

run_viterbi! (generic function with 1 method)

In [12]:
# set a random seed
seed = Random.seed!(0)

MersenneTwister(0)

In [13]:
# randomly generate a dna sequence of 100bp
genome = BioSequences.randdnaseq(seed, 100)

100nt DNA Sequence:
AAGGGTGCGGTCTAGGTGCACTGCTTATGGTCCCCGACA…TCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT

In [14]:
# define error rate
error_rate = 0.01

0.01

In [15]:
# generate a 10x coverage fastq file of error-containing observations of the genome
# and record the simulated error rate in the fastq file as a uniform quality score
coverage = 10
fastq_file = "$(DIR)/$(DATE)-$(TASK).fastq"
open(fastq_file, "w") do io
    fastq_writer = FASTX.FASTQ.Writer(io)
    # rounded so that error rates which do not map onto a whole Phred score
    # can still be encoded as a quality character
    q = round(Int, error_rate_to_q_value(error_rate))
    for i in 1:coverage
        observed_sequence = Mycelia.observe(genome, error_rate=error_rate)
        quality_scores = fill(q, length(observed_sequence))
        # NOTE(review): every record gets the literal identifier "i" rather than
        # the loop index — confirm whether unique read names were intended
        fastq_record = FASTX.FASTQ.Record("i", observed_sequence, quality_scores)
        write(fastq_writer, fastq_record)
    end
    # close the writer explicitly so that anything it has buffered is written
    # out before `open` closes the underlying file
    close(fastq_writer)
end

In [16]:
# build weighted kmer graph
# at some point we may want to record weights as integers rather than lists of evidence, but first confirm the algorithm

In [17]:
# k = Int(1/error_rate + 1)

In [18]:
# Starting from k = 11, step through successive primes until the kmer spectrum
# of the reads is sparse: either fewer distinct canonical kmers were observed
# than are possible for this k, or at least one kmer was observed exactly once.
k = 11
# number of possible canonical kmers; floating point so that large k cannot
# overflow integer arithmetic
total_possible_kmers = (4.0^k)/2
reached_sparsity = false
while !reached_sparsity
    @info "assessing k = $k for kmer sparsity"
    kmer_type = BioSequences.BigDNAMer{k}
    canonical_kmer_counts = Mycelia.count_canonical_kmers(kmer_type, fastq_file)
    # recomputed on every pass so that it always matches the k being assessed
    total_possible_kmers = (4.0^k)/2

    reached_sparsity = 
        # we observed fewer than all possible kmers
        (length(keys(canonical_kmer_counts)) < total_possible_kmers) ||
        # we have singletons
        any(count -> count == 1, values(canonical_kmer_counts))
    if !reached_sparsity
        k = Primes.nextprime(k+1)
    else
        @info "selecting k = $k to start error correction procedure"
    end
end

┌ Info: assessing k = 11 for kmer sparsity
└ @ Main In[18]:5
┌ Info: selecting k = 11 to start error correction procedure
└ @ Main In[18]:17


# Round 1

In [19]:
kmer_type = BioSequences.BigDNAMer{k}

BioSequences.BigDNAMer{11} (alias for BioSequences.BigMer{BioSequences.DNAAlphabet{2}, 11})

In [20]:
kmer_graph = Mycelia.fastx_to_kmer_graph(kmer_type, fastq_file)

┌ Info: assessing kmers
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:1877
┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:1891


{120, 240} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [21]:
# visualize
gfa_file = fastq_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(kmer_graph, gfa_file)

"/Users/cameronprybol/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.fastq.k-11.gfa"

In [22]:
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

Process(`[4m/Applications/Bandage.app/Contents/MacOS/Bandage[24m [4mimage[24m [4m/Users/cameronprybol/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.fastq.k-11.gfa[24m [4m/Users/cameronprybol/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.fastq.k-11.gfa.svg[24m [4m--depwidth[24m [4m1[24m [4m--deppower[24m [4m1[24m`, ProcessExited(0))

In [23]:
html_path_to_svg = "./" * repeat("../", length(split(pwd(), '/')) - 3)
html_path_to_svg *= replace("$(gfa_file).svg", "$(homedir())/" => "")

"./../../../../2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.fastq.k-11.gfa.svg"

In [24]:
x = display("text/html", "<img src=$(html_path_to_svg)>")

In [32]:
output_fastq_file = polish_fastq(kmer_graph, fastq_file)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:19[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m


"/Users/cameronprybol/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.k11.fastq"

In [33]:
kmer_graph = Mycelia.fastx_to_kmer_graph(kmer_type, output_fastq_file)

┌ Info: assessing kmers
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:1877
┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:1891


{90, 178} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [34]:
# visualize
gfa_file = output_fastq_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(kmer_graph, gfa_file)

"/Users/cameronprybol/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.k11.fastq.k-11.gfa"

In [35]:
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

Process(`[4m/Applications/Bandage.app/Contents/MacOS/Bandage[24m [4mimage[24m [4m/Users/cameronprybol/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.k11.fastq.k-11.gfa[24m [4m/Users/cameronprybol/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.k11.fastq.k-11.gfa.svg[24m [4m--depwidth[24m [4m1[24m [4m--deppower[24m [4m1[24m`, ProcessExited(0))

In [36]:
html_path_to_svg = "./" * repeat("../", length(split(pwd(), '/')) - 3)
html_path_to_svg *= replace("$(gfa_file).svg", "$(homedir())/" => "")

"./../../../../2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full/2021-06-20-2021-06-20-100bp-10x-coverage-0.01-error-rate-full.k11.fastq.k-11.gfa.svg"

In [37]:
x = display("text/html", "<img src=$(html_path_to_svg)>")

# Round 2

In [None]:
k = Primes.nextprime(k+1)

In [None]:
kmer_type = BioSequences.BigDNAMer{k}

In [None]:
kmer_graph = Mycelia.fastx_to_kmer_graph(kmer_type, output_fastq_file)

In [None]:
# visualize
gfa_file = output_fastq_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(kmer_graph, gfa_file)

In [None]:
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

In [None]:
html_path_to_svg = "./" * repeat("../", length(split(pwd(), '/')) - 3)
html_path_to_svg *= replace("$(gfa_file).svg", "$(homedir())/" => "")

In [None]:
x = display("text/html", "<img src=$(html_path_to_svg)>")

In [None]:
output_fastq_file = polish_fastq(kmer_graph, output_fastq_file)

In [None]:
kmer_graph = Mycelia.fastx_to_kmer_graph(kmer_type, output_fastq_file)

In [None]:
# visualize
gfa_file = output_fastq_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(kmer_graph, gfa_file)

In [None]:
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

In [None]:
html_path_to_svg = "./" * repeat("../", length(split(pwd(), '/')) - 3)
html_path_to_svg *= replace("$(gfa_file).svg", "$(homedir())/" => "")

In [None]:
x = display("text/html", "<img src=$(html_path_to_svg)>")