In [1]:
# Experiment metadata. The working directory is "<home>/<DATE>-<TASK>";
# `mkpath` creates it if needed and returns the path, which is kept in DIR.
DATE = "2021-06-25"
TASK = "simplified-error-correction"
DIR = string(DATE, "-", TASK)
DIR = mkpath(string(homedir(), "/", DIR))

"/Users/cameronprybol/2021-06-25-simplified-error-correction"

In [42]:
# Registered packages this notebook depends on. Each one is installed and then
# brought in with a qualified `import`, so calls are written `Module.name(...)`.
pkgs = [
"Graphs",
"MetaGraphs",
"BioSequences",
"uCSV",
"DataFrames",
"FASTX",
"Random",
"ProgressMeter",
"Revise",
"StatsBase",
"BioAlignments",
"Statistics"
]

import Pkg
Pkg.add(pkgs)
for pkg in pkgs
    # Interpolate the module name as a Symbol into an `import` expression
    # instead of parsing a string of source code.
    @eval import $(Symbol(basename(pkg)))
end

# Not part of `pkgs`, so it is not passed to `Pkg.add` above.
import Mycelia

In [3]:
# Seed the random number generator with 0 for reproducibility and keep the
# value returned by `Random.seed!` so it can be passed explicitly to the
# sequence generator below.
seed = Random.seed!(0)

MersenneTwister(0)

In [4]:
# Draw a random 100 bp DNA sequence from the seeded RNG; this is the
# sequence the simulated reads are observed from.
genome = BioSequences.randdnaseq(seed, 100)

100nt DNA Sequence:
AAGGGTGCGGTCTAGGTGCACTGCTTATGGTCCCCGACA…TCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT

In [5]:
# Error rate passed to `Mycelia.observe` when simulating reads; the same value
# is converted to the quality score written into the FASTQ records.
error_rate = 0.01

0.01

In [6]:
# Simulate `coverage` noisy observations of `genome` and write them to a FASTQ
# file inside DIR. Every base of every read gets the same Phred quality,
# Q = -10 * log10(error_rate), i.e. the simulation's own error rate.
coverage = 10
fastq_file = "$(DIR)/$(DATE)-$(TASK).fastq"

# Phred scores are whole numbers; rounding once up front avoids handing the
# record constructor a Float64 that may not convert exactly for other rates.
phred_quality = round(Int, -10 * log10(error_rate))

fastq_writer = FASTX.FASTQ.Writer(open(fastq_file, "w"))
try
    for i in 1:coverage
        observed_sequence = Mycelia.observe(genome, error_rate=error_rate)
        quality_scores = fill(phred_quality, length(observed_sequence))
        # Use the read number as the identifier so records are distinguishable
        # (previously every record was named with the literal string "i").
        fastq_record = FASTX.FASTQ.Record(string(i), observed_sequence, quality_scores)
        write(fastq_writer, fastq_record)
    end
finally
    # Closing the writer (not just the underlying file) makes sure any output
    # it is still buffering reaches disk, even if simulation fails part-way.
    close(fastq_writer)
end

In [7]:
# k-mer length used for the k-mer type and the graph file names below.
k = 11

11

In [8]:
# Concrete DNA k-mer type of length `k`; passed to the graph builder and to the
# k-mer iterator used during read correction.
kmer_type = BioSequences.BigDNAMer{k}

BioSequences.BigDNAMer{11} (alias for BioSequences.BigMer{BioSequences.DNAAlphabet{2}, 11})

In [9]:
# Build the k-mer graph from the simulated (uncorrected) reads.
simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(kmer_type, fastq_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2957


{120, 240} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [10]:
# Visualize: export the raw-read graph to a GFA file whose name is the FASTQ
# path plus a ".k-<k>.gfa" suffix.
gfa_file = string(fastq_file, ".k-", k, ".gfa")
Mycelia.graph_to_gfa(simple_kmer_graph, gfa_file)

"/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.fastq.k-11.gfa"

In [11]:
# Render the GFA file to "<gfa_file>.svg" with Bandage's `image` command.
# The executable path is hard-coded to the macOS application bundle.
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# Relevant Bandage options, for reference:
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

Process(`[4m/Applications/Bandage.app/Contents/MacOS/Bandage[24m [4mimage[24m [4m/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.fastq.k-11.gfa[24m [4m/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.fastq.k-11.gfa.svg[24m [4m--depwidth[24m [4m1[24m [4m--deppower[24m [4m1[24m`, ProcessExited(0))

In [12]:
# Path of the rendered SVG relative to the current working directory, for use
# as an <img> source. `relpath` works out the "../" hops itself, so this no
# longer assumes the home directory is exactly two levels below the root or
# that the working directory lives under it. The leading "./" keeps the same
# form the path had before.
html_path_to_svg = joinpath(".", relpath("$(gfa_file).svg", pwd()))

"./../../../../2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.fastq.k-11.gfa.svg"

In [13]:
# Show the rendered SVG inline. The src attribute is quoted so that a path
# containing whitespace still yields a well-formed <img> tag.
x = display("text/html", "<img src=\"$(html_path_to_svg)\">")

In [43]:
# Polish the reads against the "solid" part of the k-mer graph.
#
# 1. Keep only vertices whose :weight is at least `min_depth` and take the
#    induced subgraph.
# 2. Walk every read k-mer by k-mer. Solid k-mers are appended to the read's
#    path through the filtered graph. A run of non-solid k-mers (a "bubble")
#    that is flanked by solid k-mers on both sides is replaced by a path
#    sampled from up to three shortest graph paths between the flanking
#    k-mers, weighted by graph likelihood and agreement with the observed bases.
# 3. The path is converted back to a sequence and written to
#    `polished_fastq_file`.
min_depth = 3
solid_vertices = filter(v -> simple_kmer_graph.vprops[v][:weight] >= min_depth, Graphs.vertices(simple_kmer_graph))
filtered_simple_kmer_graph, vertex_map = Graphs.induced_subgraph(simple_kmer_graph, solid_vertices)
# k-mer stored on each vertex of the filtered graph, indexed by vertex number.
kmers = [filtered_simple_kmer_graph.vprops[v][:kmer] for v in Graphs.vertices(filtered_simple_kmer_graph)]
k = filtered_simple_kmer_graph.gprops[:k]

polished_fastq_file = replace(fastq_file, ".fastq" => ".k$k.d$(min_depth).fastq")

transition_probabilities = Mycelia.initialize_transition_probabilities(filtered_simple_kmer_graph)
# Per-vertex likelihood: vertex weight normalized so the values sum to 1.
state_likelihoods = [Float64(filtered_simple_kmer_graph.vprops[v][:weight]) for v in Graphs.vertices(filtered_simple_kmer_graph)]
state_likelihoods ./= sum(state_likelihoods)

fastq_reader = FASTX.FASTQ.Reader(open(fastq_file))
fastq_writer = FASTX.FASTQ.Writer(open(polished_fastq_file, "w"))

# try/finally: the correction loop can throw (see the `error` calls below);
# both handles must still be closed so the reader is released and whatever the
# writer has buffered is written out.
try
    for fastx_record in fastq_reader
        # Index of the first k-mer of the currently open bubble; 0 means "not in a bubble".
        bubble_start = 0
        # Corrected walk through the filtered graph as (vertex => orientation) pairs.
        updated_path = Vector{Pair{Int, Bool}}()
        read_sequence = FASTX.sequence(fastx_record)

        # NOTE(review): `i` is the enumeration index and is later used as a base
        # position in the read; this presumably relies on the iterator yielding
        # one k-mer per position (no skipped k-mers) — confirm.
        for (i, kmer) in enumerate(BioSequences.each(kmer_type, read_sequence))
            canonical_kmer = min(kmer.fw, kmer.bw)
            # true when the read shows the k-mer in its canonical orientation
            orientation = canonical_kmer == kmer.fw
            # Binary search — assumes `kmers` is in sorted order; TODO confirm that
            # the graph construction and `induced_subgraph` guarantee this.
            kmer_index_range = searchsorted(kmers, canonical_kmer)
            kmer_is_solid = !isempty(kmer_index_range)
            in_bubble = bubble_start > 0

            if !kmer_is_solid
                # Open a bubble at the first non-solid k-mer; later ones extend it.
                if !in_bubble
                    bubble_start = i
                end
                continue
            end

            kmer_index = first(kmer_index_range)

            if !in_bubble
                push!(updated_path, kmer_index => orientation)
                continue
            end

            if bubble_start == 1
                # The bubble began at the very start of the read, so there is no
                # left anchor to correct from: drop it (equivalent to tip
                # clipping) and start the path at this solid k-mer.
                push!(updated_path, kmer_index => orientation)
                bubble_start = 0
                continue
            end

            # Anchored bubble: solid k-mers on both sides. Replace it with a path
            # through the filtered graph between the two anchors.
            anchor = last(updated_path)
            source_vertex = first(anchor)
            destination_vertex = kmer_index

            shortest_paths = Graphs.yen_k_shortest_paths(
                filtered_simple_kmer_graph,
                source_vertex,
                destination_vertex,
                Graphs.weights(filtered_simple_kmer_graph),
                3).paths

            if isempty(shortest_paths)
                error("no valid alternate paths found")
            end

            candidate_path_probabilities = ones(length(shortest_paths))
            # Every oriented candidate starts at the left anchor, with its orientation.
            oriented_candidate_paths = [[anchor] for _ in eachindex(shortest_paths)]

            # Score each candidate by the product of transition probabilities and
            # state likelihoods along it, resolving the orientation of each step.
            for (path_index, candidate_path) in enumerate(shortest_paths)
                for dest_vertex in candidate_path[2:end]
                    previous_vertex, previous_orientation = last(oriented_candidate_paths[path_index])
                    candidate_path_probabilities[path_index] *= transition_probabilities[previous_orientation][previous_vertex, dest_vertex]
                    candidate_path_probabilities[path_index] *= state_likelihoods[dest_vertex]
                    if candidate_path_probabilities[path_index] > 0
                        edge = Graphs.Edge(previous_vertex, dest_vertex)
                        # First edge orientation that leaves the previous vertex in
                        # the orientation we arrived in.
                        matching_orientations = filter(
                            o -> o.source_orientation == previous_orientation,
                            filtered_simple_kmer_graph.eprops[edge][:orientations])
                        next_orientation = first(matching_orientations).destination_orientation
                        push!(oriented_candidate_paths[path_index], dest_vertex => next_orientation)
                    else
                        break # this path is no good, evaluate the next
                    end
                end
            end

            non_zero_indices = findall(p -> p > 0, candidate_path_probabilities)
            if isempty(non_zero_indices)
                error("no valid alternate path probabilities")
            end

            candidate_path_probabilities = candidate_path_probabilities[non_zero_indices]
            oriented_candidate_paths = oriented_candidate_paths[non_zero_indices]

            # offset is for debugging
            # make sure that anchors on both sides are the same
            offset = 0
            # Bases of the read that lie strictly between the two anchor k-mers.
            observed_sequence = read_sequence[(bubble_start + k - 1 - offset):(i - 1 + offset)]
            # Same for every candidate, so computed once per bubble.
            average_error_rate = Statistics.mean(Mycelia.q_value_to_error_rate.(FASTX.quality(fastx_record)))

            # Weight each candidate by how well its interior matches the read:
            # one factor of the error rate per edit, one of (1 - rate) per match.
            for (path_index, oriented_candidate_path) in enumerate(oriented_candidate_paths)
                candidate_sequence = Mycelia.oriented_path_to_sequence(
                    filtered_simple_kmer_graph,
                    oriented_candidate_path)
                # Trim the anchor k-mers from both ends of the candidate.
                candidate_sequence = candidate_sequence[(k + 1 - offset):(end - k + offset)]
                alignment_result = BioAlignments.pairalign(
                    BioAlignments.LevenshteinDistance(),
                    candidate_sequence,
                    observed_sequence)
                @show alignment_result
                for _ in 1:alignment_result.value
                    candidate_path_probabilities[path_index] *= average_error_rate
                end
                for _ in 1:BioAlignments.count_matches(alignment_result.aln)
                    candidate_path_probabilities[path_index] *= (1 - average_error_rate)
                end
            end

            chosen_replacement = StatsBase.sample(oriented_candidate_paths, StatsBase.weights(candidate_path_probabilities))

            # The first element is the left anchor, which is already on the path.
            append!(updated_path, chosen_replacement[2:end])
            bubble_start = 0
        end

        # A bubble still open here (non-solid k-mers running to the end of the
        # read) has contributed nothing to `updated_path`, so that tail is dropped.
        # NOTE(review): if a read has no solid k-mer at all, `updated_path` is
        # empty — confirm `oriented_path_to_sequence` copes with that.
        sequence = Mycelia.oriented_path_to_sequence(filtered_simple_kmer_graph, updated_path)
        alignment_result = BioAlignments.pairalign(
            BioAlignments.LevenshteinDistance(),
            sequence,
            read_sequence)
        # Report reads whose polished sequence differs from the original.
        if alignment_result.value > 0
            @show alignment_result
        end
        # The polished sequence can differ in length from the read, so draw
        # `length(sequence)` quality values from the original ones, with
        # replacement, kept in their original order.
        quality = StatsBase.sample(FASTX.quality(fastx_record), length(sequence), replace=true, ordered=true)
        # Tag the description with the polishing parameters.
        description =  join(filter(!isempty, (FASTX.description(fastx_record), "k$k.d$(min_depth)")), '.')
        identifier = FASTX.identifier(fastx_record)
        new_record = FASTX.FASTQ.Record(identifier, description, sequence, quality)
        write(fastq_writer, new_record)
    end
finally
    close(fastq_reader)
    close(fastq_writer)
end

alignment_result = BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 8
  seq:  0 --------GTCTAGGTGCACTGCTTATGGTCCCCGACAGGACCGTGCGGTGATTATCTGA 52
                  ||||||||||||||||||||||||||||||||||||||||||||||||||||
  ref:  1 AAGGGTGCGTCTAGGTGCACTGCTTATGGTCCCCGACAGGACCGTGCGGTGATTATCTGA 60

  seq: 53 TCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT 91
          |||||||||||||||||||||||||||||||||||||||
  ref: 61 TCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT 99

alignment_result = BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 1
  seq: 1 A 1
          
  ref: 1 G 1

alignment_result = BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 1
  seq:   1 AGGTGTTAAGAGAAGAATACGTTTCAACAAAGAGGTAGATCAGATAATCACCGCACGGTC  60
           |||||||||||||||||||||||||||||||||||||||||||||||||||||| |||||
  ref:   1 AGGTGTTAAGAGAAGAATACGTTTCAACAAAGAGGTAGA

In [45]:
# Rebuild the k-mer graph from the polished reads. This rebinds
# `simple_kmer_graph`, replacing the graph built from the raw reads.
simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(kmer_type, polished_fastq_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2957


{90, 178} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [46]:
# Visualize: export the polished-read graph to a GFA file whose name is the
# polished FASTQ path plus a ".k-<k>.gfa" suffix.
gfa_file = string(polished_fastq_file, ".k-", k, ".gfa")
Mycelia.graph_to_gfa(simple_kmer_graph, gfa_file)

"/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.k11.d3.fastq.k-11.gfa"

In [47]:
# Render the polished-read GFA file to "<gfa_file>.svg" with Bandage's `image`
# command. The executable path is hard-coded to the macOS application bundle.
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# Relevant Bandage options, for reference:
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

Process(`[4m/Applications/Bandage.app/Contents/MacOS/Bandage[24m [4mimage[24m [4m/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.k11.d3.fastq.k-11.gfa[24m [4m/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.k11.d3.fastq.k-11.gfa.svg[24m [4m--depwidth[24m [4m1[24m [4m--deppower[24m [4m1[24m`, ProcessExited(0))

In [48]:
# Path of the rendered SVG relative to the current working directory, for use
# as an <img> source. `relpath` works out the "../" hops itself, so this no
# longer assumes the home directory is exactly two levels below the root or
# that the working directory lives under it. The leading "./" keeps the same
# form the path had before.
html_path_to_svg = joinpath(".", relpath("$(gfa_file).svg", pwd()))

"./../../../../2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.k11.d3.fastq.k-11.gfa.svg"

In [49]:
# Show the rendered SVG inline. The src attribute is quoted so that a path
# containing whitespace still yields a well-formed <img> tag.
x = display("text/html", "<img src=\"$(html_path_to_svg)\">")