In [1]:
DATE = "2021-06-25"
TASK = "simplified-error-correction"
DIR = "$(DATE)-$(TASK)"
DIR = mkpath("$(homedir())/$(DIR)")

"/Users/cameronprybol/2021-06-25-simplified-error-correction"

In [90]:
pkgs = [
"LightGraphs",
"MetaGraphs",
"BioSequences",
"uCSV",
"DataFrames",
"FASTX",
"Random",
"ProgressMeter",
"Revise",
"StatsBase"
]

import Pkg
Pkg.add(pkgs)
for pkg in pkgs
    eval(Meta.parse("import $(basename(pkg))"))
end

import Mycelia

In [3]:
# set a random seed
seed = Random.seed!(0)

MersenneTwister(0)

In [4]:
# randomly generate a dna sequence of 100bp
genome = BioSequences.randdnaseq(seed, 100)

100nt DNA Sequence:
AAGGGTGCGGTCTAGGTGCACTGCTTATGGTCCCCGACA…TCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT

In [5]:
# define error rate
error_rate = 0.01

0.01

In [6]:
# generate 100x coverage fastq file
# put accuracy rate into fastq file
coverage = 10
fastq_file = "$(DIR)/$(DATE)-$(TASK).fastq"
open(fastq_file, "w") do io
    fastq_writer = FASTX.FASTQ.Writer(io)
    for i in 1:coverage
        observed_sequence = Mycelia.observe(genome, error_rate=error_rate)
        q = -10 * log10(error_rate)
        quality_scores = fill(q, length(observed_sequence))
        fastq_record = FASTX.FASTQ.Record("i", observed_sequence, quality_scores)
        write(fastq_writer, fastq_record)
    end
end

In [7]:
k = 11

11

In [8]:
kmer_type = BioSequences.BigDNAMer{k}

BioSequences.BigDNAMer{11} (alias for BioSequences.BigMer{BioSequences.DNAAlphabet{2}, 11})

In [9]:
simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(kmer_type, fastq_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2957


{120, 240} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [10]:
# visualize
gfa_file = fastq_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(simple_kmer_graph, gfa_file)

"/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.fastq.k-11.gfa"

In [11]:
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

Process(`[4m/Applications/Bandage.app/Contents/MacOS/Bandage[24m [4mimage[24m [4m/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.fastq.k-11.gfa[24m [4m/Users/cameronprybol/2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.fastq.k-11.gfa.svg[24m [4m--depwidth[24m [4m1[24m [4m--deppower[24m [4m1[24m`, ProcessExited(0))

In [12]:
html_path_to_svg = "./" * repeat("../", length(split(pwd(), '/')) - 3)
html_path_to_svg *= replace("$(gfa_file).svg", "$(homedir())/" => "")

"./../../../../2021-06-25-simplified-error-correction/2021-06-25-simplified-error-correction.fastq.k-11.gfa.svg"

In [13]:
x = display("text/html", "<img src=$(html_path_to_svg)>")

In [21]:
solid_vertices = filter(v -> simple_kmer_graph.vprops[v][:weight] > 1, LightGraphs.vertices(simple_kmer_graph))

90-element Vector{Int64}:
   1
   2
   3
   4
   5
   6
   7
   8
  11
  12
  14
  16
  17
   ⋮
 105
 106
 108
 109
 110
 111
 113
 114
 115
 117
 118
 119

In [24]:
filtered_simple_kmer_graph, vertex_map = LightGraphs.induced_subgraph(simple_kmer_graph, solid_vertices)

({90, 178} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0), [1, 2, 3, 4, 5, 6, 7, 8, 11, 12  …  108, 109, 110, 111, 113, 114, 115, 117, 118, 119])

In [26]:
kmers = [filtered_simple_kmer_graph.vprops[v][:kmer] for v in LightGraphs.vertices(filtered_simple_kmer_graph)]

90-element Vector{BioSequences.BigDNAMer{11}}:
 AAACGTATTCT
 AAAGAGGTAGA
 AACAAAGAGGT
 AACGTATTCTT
 AAGAGAAGAAT
 AAGAGGTAGAT
 AAGCAGTGCAC
 AAGGGTGCGGT
 AATACGTTTCA
 AATCACCGCAC
 ACAAAGAGGTA
 ACAGGACCGTG
 ACCATAAGCAG
 ⋮
 GTGCACCTAGA
 GTGTTAAGAGA
 GTTGAAACGTA
 GTTTCAACAAA
 TAAGAGAAGAA
 TAAGCAGTGCA
 TAATCACCGCA
 TAGATCAGATA
 TCAGATAATCA
 TCTTCTCTTAA
 TCTTTGTTGAA
 TGTTAAGAGAA

In [14]:
fastx_records = collect(FASTX.FASTQ.Reader(open(fastq_file)))

10-element Vector{FASTX.FASTQ.Record}:
 FASTX.FASTQ.Record:
   identifier: i
  description: <missing>
     sequence: AAGGGTGCGGTCTAGGTGCACTGCTTATGGTCCCCGACAGGACCGTGCGGTGATTATCTGATCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT
      quality: UInt8[0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14  …  0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14]
 FASTX.FASTQ.Record:
   identifier: i
  description: <missing>
     sequence: AAGGGTGCGTCTAGGTGCACTGCTTATGGTCCCCGACAGGACCGTGCGGTGATTATCTGATCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT
      quality: UInt8[0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14  …  0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14]
 FASTX.FASTQ.Record:
   identifier: i
  description: <missing>
     sequence: AAGGGTGCGGTCTAGGTGCACTGCTTATGGTCCCCGACAGGACCGTGCGGTGATTATCTGATCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT
      quality: UInt8[0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14  …  0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14,

In [41]:
record = fastx_records[5]

FASTX.FASTQ.Record:
   identifier: i
  description: <missing>
     sequence: AGGTGTTAAGAGAAGAATACGTTTCAACAAAGAGGTAGATCAGATAATCACCGCGCGGTCCTGTCGGGGACCATAAGCAGTGCACCTAGACCGCACCCTT
      quality: UInt8[0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14  …  0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14]

In [94]:
kmer_iterator = collect(BioSequences.each(kmer_type, FASTX.sequence(record)))
bubble_start = 0

# iterate through the read, asking whether the

updated_path = Vector{Pair{Int, Bool}}()

for (i, kmer) in enumerate(kmer_iterator)
    canonical_kmer = min(kmer.fw, kmer.bw)
    orientation = canonical_kmer == kmer.fw
    kmer_index_range = searchsorted(kmers, canonical_kmer)
    kmer_is_solid = !isempty(kmer_index_range)
    @show kmer_is_solid
    if kmer_is_solid
        kmer_index = first(kmer_index_range)
    else
        kmer_index = 0
    end
    @show kmer_index
    
    bubble_has_started = bubble_start > 0
    
    if kmer_is_solid
        if !bubble_has_started
            @show "solid stretch"
            push!(updated_path, kmer_index => orientation)
        else
            @show "end of bubble!!"
            if bubble_start > 1
                @show "check for shortest paths"
                
                source_vertex, source_orientation = last(updated_path)
                destination_vertex, destination_orientation = kmer_index, orientation
                
                @show source_vertex, source_orientation
                @show destination_vertex, destination_orientation
                
                shortest_paths = LightGraphs.yen_k_shortest_paths(
                    filtered_simple_kmer_graph,
                    source_vertex,
                    destination_vertex,
                    LightGraphs.weights(filtered_simple_kmer_graph),
                    3).paths
                
                if isempty(shortest_paths)
                    error("no valid alternate paths found")
                end
                candidate_path_probabilities = ones(length(shortest_paths))
                oriented_candidate_paths = [
                    [last(updated_path)] for i in 1:length(shortest_paths)
                ]

                for (i, candidate_path) in enumerate(shortest_paths)
                    for dest_vertex in candidate_path[2:end]
                        source_vertex, source_orientation = last(oriented_candidate_paths[i])
                        candidate_path_probabilities[i] *= transition_probabilities[source_orientation][source_vertex, dest_vertex]
                        candidate_path_probabilities[i] *= state_likelihoods[dest_vertex]
                        if candidate_path_probabilities[i] > 0
                            edge = LightGraphs.Edge(source_vertex, dest_vertex)
                            destination_orientation = 
                            first(
                                filter(o -> o.source_orientation == source_orientation,
                                    filtered_simple_kmer_graph.eprops[edge][:orientations])).destination_orientation
                            push!(oriented_candidate_paths[i], (dest_vertex => destination_orientation))
                        else
                            break # this path is no good, evaluate the next
                        end
                    end
                end

                if !any(p -> p > 0, candidate_path_probabilities)
                    error("no valid alternate path probabilities")
                end
                chosen_replacement = StatsBase.sample(oriented_candidate_paths, StatsBase.weights(candidate_path_probabilities))
                
                for i in 2:length(chosen_replacement)
                    oriented_state = chosen_replacement[i]
                    push!(updated_path, oriented_state)
                end
            else
                @show "beginning of read is loose, clipping"
            end
            bubble_start = 0
        end
    else
        if !bubble_has_started
            @show "beginning of bubble!"
            bubble_start = i
        else
            @show "continuation of bubble!"
        end
    end
end

kmer_is_solid = true
kmer_index = 32
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 61
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 80
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 90
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 65
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 88
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 83
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 5
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 23
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 69
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 21
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 18
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 4
"solid stretch" = "solid stretch"
kmer_is_solid = true
kmer_index = 1
"solid stretch" = "solid stretch"
kmer_is_s

In [74]:
transition_probabilities = Mycelia.initialize_transition_probabilities(filtered_simple_kmer_graph)

Dict{Bool, SparseArrays.SparseMatrixCSC{Float64, Int64}} with 2 entries:
  0 => …
  1 => …

In [77]:
state_likelihoods = [Float64(filtered_simple_kmer_graph.vprops[v][:weight]) for v in LightGraphs.vertices(filtered_simple_kmer_graph)]
state_likelihoods ./= sum(state_likelihoods)

90-element Vector{Float64}:
 0.011507479861910242
 0.011507479861910242
 0.011507479861910242
 0.011507479861910242
 0.010356731875719217
 0.011507479861910242
 0.011507479861910242
 0.010356731875719217
 0.011507479861910242
 0.010356731875719217
 0.011507479861910242
 0.010356731875719217
 0.011507479861910242
 ⋮
 0.011507479861910242
 0.010356731875719217
 0.011507479861910242
 0.011507479861910242
 0.010356731875719217
 0.011507479861910242
 0.010356731875719217
 0.011507479861910242
 0.011507479861910242
 0.010356731875719217
 0.011507479861910242
 0.010356731875719217

In [95]:
Mycelia.oriented_path_to_sequence(filtered_simple_kmer_graph, updated_path)

100nt DNA Sequence:
AGGTGTTAAGAGAAGAATACGTTTCAACAAAGAGGTAGA…TGTCGGGGACCATAAGCAGTGCACCTAGACCGCACCCTT

In [None]:
output_fastq_file = Mycelia.polish_fastq(simple_kmer_graph, fastq_file)

In [None]:
simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(kmer_type, output_fastq_file)

In [None]:
# visualize
gfa_file = output_fastq_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(simple_kmer_graph, gfa_file)

In [None]:
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

In [None]:
html_path_to_svg = "./" * repeat("../", length(split(pwd(), '/')) - 3)
html_path_to_svg *= replace("$(gfa_file).svg", "$(homedir())/" => "")

In [None]:
x = display("text/html", "<img src=$(html_path_to_svg)>")