In [1]:
# Date stamp and task name that identify this analysis
DATE = "2021-09-05"
TASK = "annealing-correction"
# Create (if it does not exist yet) a per-analysis directory under the home directory
DIR = mkpath("$(homedir())/$(DATE)-$(TASK)")

"/home/jovyan/2021-09-05-annealing-correction"

In [2]:
import Pkg
pkgs = [
    "BioAlignments",
    "BioSequences",
    "Clustering",
#     "CodecZlib",
#     "Colors",
#     "Combinatorics",
    "DataFrames",
    "DataStructures",
    "Dates",
#     "DelimitedFiles",
    "Distances",
#     "Distributions",
#     "EzXML",
    "FASTX",
#     "GFF3",
    "GraphPlot",
#     "HTTP",
#     "Impute",
#     "JSON",
    "Graphs",
#     "LSHFunctions",
#     "Measures",
    "MetaGraphs",
    "https://github.com/cjprybol/Mycelia.git",
#     "NumericIO",
#     "PlotlyJS",
#     "Plots",
    "Primes",
#     "Printf",
    "ProgressMeter",
    "Random",
    "Revise",
    "SparseArrays",
    "Statistics",
    "StatsBase",
    "StatsPlots",
#     "StringDistances",
    "uCSV",
#     "XLSX",
]

unregistered_packages = filter(pkg -> occursin(r"(^https|git$)", pkg), pkgs)
registered_packages = setdiff(pkgs, unregistered_packages)

for pkg in registered_packages
    try
        eval(Meta.parse("import $(pkg)"))
    catch
        Pkg.add(pkg)
        Pkg.build(pkg)
        eval(Meta.parse("import $(pkg)"))
    end
end

for pkg_url in unregistered_packages
    pkg_name = replace(basename(pkg_url), ".git" => "")
    try
        eval(Meta.parse("import $(pkg_name)"))
    catch
        Pkg.develop(url=pkg_url)
        Pkg.build(pkg_name)
        eval(Meta.parse("import $(pkg_name)"))
    end
end

In [3]:
"""
    kmer_index_and_orientation_to_kmer(graph, kmer_index, orientation)

Return the k-mer stored under `:kmer` for vertex `kmer_index` of `graph`.

The stored k-mer is returned unchanged when `orientation` is `true`; when it is
`false` the reverse complement of the stored k-mer is returned instead.
"""
function kmer_index_and_orientation_to_kmer(graph, kmer_index, orientation)
    stored_kmer = graph.vprops[kmer_index][:kmer]
    return !orientation ? BioSequences.reverse_complement(stored_kmer) : stored_kmer
end

kmer_index_and_orientation_to_kmer (generic function with 1 method)

In [4]:
"""
    determine_edge_probabilities(graph)

Build a sparse `nv × nv` matrix of step probabilities for `graph`.

For every vertex `v`, each neighbor is first weighted by the relative `:count` of the
edge connecting it to `v`, then by the relative `:count` of the neighbor vertex itself,
and finally the row is renormalized to sum to 1. Entry `[v, n]` is therefore the
probability of stepping from `v` to its neighbor `n`.

Vertices without neighbors keep an all-zero row instead of triggering a division by
zero.

# Arguments
- `graph`: a graph exposing `eprops` and `vprops` dictionaries that hold a `:count`
  entry for every edge and vertex. Edge properties are looked up under the ordered
  `(smaller, larger)` vertex pair.

# Returns
The sparse matrix created by `SparseArrays.spzeros(nv, nv)`, filled as described above.
"""
function determine_edge_probabilities(graph)
    nv = Graphs.nv(graph)
    edge_probabilities = SparseArrays.spzeros(nv, nv)
    for v in 1:nv
        neighbors = Graphs.neighbors(graph, v)
        # An isolated vertex has nothing to normalize; leave its row empty.
        isempty(neighbors) && continue
        @assert issorted(neighbors)
        likelihoods = zeros(length(neighbors))
        for (i, neighbor) in enumerate(neighbors)
            # The edge property lookup below uses the ordered vertex pair as its key.
            edge = Graphs.Edge(min(v, neighbor), max(v, neighbor))
            @assert Graphs.has_edge(graph, edge)
            likelihoods[i] = graph.eprops[edge][:count]
        end
        likelihoods = likelihoods ./ sum(likelihoods)
        for (neighbor, likelihood) in zip(neighbors, likelihoods)
            edge_probabilities[v, neighbor] = likelihood
        end
    end

    for source in 1:size(edge_probabilities, 1)
        destinations = findall(edge_probabilities[source, :] .> 0)
        # Rows without any reachable destination stay all-zero (avoids 0/0 below).
        isempty(destinations) && continue
        destination_counts = [graph.vprops[dest][:count] for dest in destinations]
        destination_likelihoods = destination_counts ./ sum(destination_counts)
        for (dest, likelihood) in zip(destinations, destination_likelihoods)
            edge_probabilities[source, dest] *= likelihood
        end
        edge_probabilities[source, :] ./= sum(edge_probabilities[source, :])
        # Floating-point normalization can be off by a few ulps, so compare approximately.
        @assert isapprox(sum(edge_probabilities[source, :]), 1.0)
    end

    return edge_probabilities
end

determine_edge_probabilities (generic function with 1 method)

In [5]:
"""
    take_a_walk(graph, edge_likelihoods, kmers, kmer, walk_length)

Take a weighted random walk through the k-mer graph, starting at `kmer`.

At every step the successors of the current k-mer (`BioSequences.neighbors`) are looked
up by their canonical form in `kmers`; successors that are present are candidate next
steps. A single candidate is taken directly, otherwise one is sampled using the
entries of `edge_likelihoods` in the row of the current vertex as weights.

# Arguments
- `graph`: graph whose vertex `i` stores a k-mer under `:kmer` (read through
  `kmer_index_and_orientation_to_kmer`).
- `edge_likelihoods`: matrix indexed as `[current_vertex, candidate_vertex]`.
- `kmers`: sorted collection of canonical k-mers searched with `searchsorted`; the
  position of a k-mer is used as its vertex index.
- `kmer`: oriented k-mer to start from; its canonical form must occur exactly once in
  `kmers`.
- `walk_length`: maximum number of k-mers in the returned walk (including the start).

# Returns
A vector of oriented k-mers beginning with `kmer`. It holds `walk_length` elements
unless the walk reaches a k-mer with no successor in `kmers`, in which case the walk
stops early and the shorter walk is returned.
"""
function take_a_walk(graph, edge_likelihoods, kmers, kmer, walk_length)
    walk = [kmer]
    current_kmer_indices = searchsorted(kmers, BioSequences.canonical(last(walk)))
    @assert length(current_kmer_indices) == 1
    current_kmer_index = first(current_kmer_indices)
    while length(walk) < walk_length
        # Collect (vertex index, is_canonical) for every successor present in `kmers`.
        viable_neighbors = Tuple{Int, Bool}[]
        for neighbor in BioSequences.neighbors(kmer)
            canonical_neighbor = BioSequences.canonical(neighbor)
            neighbor_vertex_range = searchsorted(kmers, canonical_neighbor)
            isempty(neighbor_vertex_range) && continue
            neighbor_is_canonical = neighbor == canonical_neighbor
            push!(viable_neighbors, (first(neighbor_vertex_range), neighbor_is_canonical))
        end
        if isempty(viable_neighbors)
            # Dead end: sampling from an empty candidate set would throw, so stop here.
            break
        elseif length(viable_neighbors) == 1
            chosen_neighbor = first(viable_neighbors)
        else
            viable_neighbor_indices = first.(viable_neighbors)
            step_likelihoods = StatsBase.weights(edge_likelihoods[current_kmer_index, viable_neighbor_indices])
            chosen_neighbor = StatsBase.sample(viable_neighbors, step_likelihoods)
        end
        kmer = kmer_index_and_orientation_to_kmer(graph, chosen_neighbor...)
        push!(walk, kmer)
        current_kmer_indices = searchsorted(kmers, BioSequences.canonical(last(walk)))
        @assert length(current_kmer_indices) == 1
        current_kmer_index = first(current_kmer_indices)
    end
    return walk
end

take_a_walk (generic function with 1 method)

In [6]:
# create a random genome

In [7]:
# Length (in nucleotides) of the simulated genome; also reused as the RNG seed below
L = 10

10

In [8]:
# Random DNA sequence of length L, drawn from an RNG seeded with L for reproducibility
genome = BioSequences.randdnaseq(Random.seed!(L), L)

10nt DNA Sequence:
AGCCTGCAAA

In [9]:
# determine the diminishing return point for kmers

In [10]:
# k-mer size used for counting and for building the graph
k = 3

3

In [11]:
# Count the canonical k-mers of the genome and sort the resulting table in place;
# later cells rely on this order when they binary-search the collected keys
kmer_counts = sort!(Mycelia.count_canonical_kmers(BioSequences.BigDNAMer{k}, genome))

OrderedCollections.OrderedDict{BioSequences.BigDNAMer{3}, Int64} with 7 entries:
  AAA => 1
  AGC => 1
  AGG => 1
  CAA => 1
  CAG => 1
  GCA => 2
  GCC => 1

In [12]:
# Number of distinct canonical k-mers, used as the number of graph vertices
K = length(keys(kmer_counts))

7

In [13]:
# create an undirected kmer graph from the sequence

In [14]:
# Metagraph with K vertices (one per distinct canonical k-mer) and no edges yet
graph = MetaGraphs.MetaGraph(K)

{7, 0} undirected Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [15]:
# Attach to vertex i the i-th k-mer of the sorted count table and its count;
# @show echoes every step together with the result of each set_prop! call
for (i, (kmer, count)) in enumerate(kmer_counts)
    @show i, kmer, count
    @show MetaGraphs.set_prop!(graph, i, :kmer, kmer)
    @show MetaGraphs.set_prop!(graph, i, :count, count)
end

(i, kmer, count) = (1, AAA, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (2, AGC, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (3, AGG, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (4, CAA, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (5, CAG, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (6, GCA, 2)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true
(i, kmer, count) = (7, GCC, 1)
MetaGraphs.set_prop!(graph, i, :kmer, kmer) = true
MetaGraphs.set_prop!(graph, i, :count, count) = true


In [16]:
graph.vprops

Dict{Int64, Dict{Symbol, Any}} with 7 entries:
  5 => Dict(:count=>1, :kmer=>CAG)
  4 => Dict(:count=>1, :kmer=>CAA)
  6 => Dict(:count=>2, :kmer=>GCA)
  7 => Dict(:count=>1, :kmer=>GCC)
  2 => Dict(:count=>1, :kmer=>AGC)
  3 => Dict(:count=>1, :kmer=>AGG)
  1 => Dict(:count=>1, :kmer=>AAA)

In [17]:
# Canonical k-mers in table order; position i corresponds to graph vertex i
kmers = collect(keys(kmer_counts))

7-element Vector{BioSequences.BigDNAMer{3}}:
 AAA
 AGC
 AGG
 CAA
 CAG
 GCA
 GCC

In [18]:
# Walk over every (k+1)-mer of the genome, connect the vertices of its prefix and
# suffix k-mers, and tally in the edge's :count how often the connection is observed.
for i in 1:length(genome)-k
    edge_range = i:i+k
    edge = genome[edge_range]
    src = BioSequences.BigDNAMer{k}(edge[1:end-1])
    dst = BioSequences.BigDNAMer{k}(edge[2:end])
    canonical_src = BioSequences.canonical(src)
    canonical_dst = BioSequences.canonical(dst)
    src_index = Mycelia.get_kmer_index(kmers, canonical_src)
    dst_index = Mycelia.get_kmer_index(kmers, canonical_dst)
    @show edge
    @show src_index, src == canonical_src, dst_index, dst == canonical_dst
    graph_edge = Graphs.Edge(src_index, dst_index)
    if Graphs.has_edge(graph, graph_edge)
        # Read through get_prop rather than graph.eprops[graph_edge]: the undirected
        # graph keeps properties under the ordered (low, high) key, so a direct lookup
        # with src_index > dst_index would raise a KeyError for an existing edge.
        current_count = MetaGraphs.get_prop(graph, graph_edge, :count)
        MetaGraphs.set_prop!(graph, graph_edge, :count, current_count+1)
    else
        Graphs.add_edge!(graph, graph_edge)
        MetaGraphs.set_prop!(graph, graph_edge, :count, 1)
    end
end

edge = AGCC
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (2, true, 7, true)
edge = GCCT
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (7, true, 3, false)
edge = CCTG
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (3, false, 5, false)
edge = CTGC
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (5, false, 6, false)
edge = TGCA
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (6, false, 6, true)
edge = GCAA
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (6, true, 4, true)
edge = CAAA
(src_index, src == canonical_src, dst_index, dst == canonical_dst) = (4, true, 1, true)


In [19]:
graph

{7, 7} undirected Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [20]:
graph.eprops

Dict{Graphs.SimpleGraphs.SimpleEdge{Int64}, Dict{Symbol, Any}} with 7 entries:
  Edge 3 => 7 => Dict(:count=>1)
  Edge 5 => 6 => Dict(:count=>1)
  Edge 2 => 7 => Dict(:count=>1)
  Edge 6 => 6 => Dict(:count=>1)
  Edge 3 => 5 => Dict(:count=>1)
  Edge 4 => 6 => Dict(:count=>1)
  Edge 1 => 4 => Dict(:count=>1)

In [21]:
# GraphPlot.gplot(graph, nodesize=[rand(1:2) for i in 1:Graphs.nv(graph)])

In [22]:
# Step-probability matrix derived from the edge and vertex counts of the graph
edge_likelihoods = determine_edge_probabilities(graph)

7×7 SparseArrays.SparseMatrixCSC{Float64, Int64} with 13 stored entries:
  ⋅         ⋅    ⋅        1.0    ⋅     ⋅         ⋅ 
  ⋅         ⋅    ⋅         ⋅     ⋅     ⋅        1.0
  ⋅         ⋅    ⋅         ⋅    0.5    ⋅        0.5
 0.333333   ⋅    ⋅         ⋅     ⋅    0.666667   ⋅ 
  ⋅         ⋅   0.333333   ⋅     ⋅    0.666667   ⋅ 
  ⋅         ⋅    ⋅        0.25  0.25  0.5        ⋅ 
  ⋅        0.5  0.5        ⋅     ⋅     ⋅         ⋅ 

In [23]:
# take yen_k shortest paths

In [52]:
# One k-mer record per genome position, produced by BioSequences.each
genome_kmers = collect(BioSequences.each(BioSequences.BigDNAMer{k}, genome))

8-element Vector{BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}}:
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(1, AGC, GCT)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(2, GCC, GGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(3, CCT, AGG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(4, CTG, CAG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(5, TGC, GCA)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(6, GCA, TGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(7, CAA, TTG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(8, AAA, TTT)

In [24]:
# simulate a noisy observation of the genome (errors may be substitutions or indels)

In [78]:
# Seed counter; the next cell increments it before drawing an observation
seed = 0

0

In [79]:
# Advance the seed, reseed the global RNG, and draw an observation of the genome.
# NOTE(review): Mycelia.observe presumably injects errors at `error_rate` — confirm.
seed += 1
@show seed
Random.seed!(seed)
observation = Mycelia.observe(genome, error_rate = 0.1)

seed = 1


11nt DNA Sequence:
AGCGCTGCAAA

In [80]:
# Levenshtein-distance alignment of the observation against the genome
alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), observation, genome)

BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 1
  seq:  1 AGCGCTGCAAA 11
          ||| |||||||
  ref:  1 AGC-CTGCAAA 10


In [81]:
# Same alignment for the reverse complement of the observation, to compare strands
reverse_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), BioSequences.reverse_complement(observation), genome)

BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 8
  seq:  1 TTTGCAGCGCT 11
              | ||   
  ref:  0 -AGCCTGCAAA 10


In [82]:
# Keep the strand of the observation that aligns to the genome with the smaller
# distance; when the reverse complement wins, flip the observation in place
if reverse_alignment.value < alignment.value
    observation = BioSequences.reverse_complement!(observation)
    alignment = reverse_alignment
    @show "flipping"
end

In [83]:
# convert the observation into a stranded k-mer path

In [84]:
# function sequence_to_
observation_as_oriented_kmers = []
observation_kmers = collect(BioSequences.each(BioSequences.BigDNAMer{k}, observation))

9-element Vector{BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}}:
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(1, AGC, GCT)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(2, GCG, CGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(3, CGC, GCG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(4, GCT, AGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(5, CTG, CAG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(6, TGC, GCA)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(7, GCA, TGC)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(8, CAA, TTG)
 BioSequences.MerIterResult{BioSequences.BigDNAMer{3}}(9, AAA, TTT)

In [85]:
# For every observed k-mer record whether its forward form is the canonical one and
# which graph vertex (position in `kmers`) it maps to; 0 marks a k-mer that is not
# in the graph.
is_canonical = falses(length(observation_kmers))
kmer_index = zeros(Int, length(observation_kmers))
for (position, observed) in enumerate(observation_kmers)
    canonical_form = BioSequences.canonical(observed.fw)
    is_canonical[position] = observed.fw == canonical_form
    hits = searchsorted(kmers, canonical_form)
    hit_count = length(hits)
    if hit_count == 1
        kmer_index[position] = first(hits)
    elseif hit_count > 1
        # More than one match is only reported; the index stays 0
        @error "bad"
    end
end

In [86]:
is_canonical

9-element BitVector:
 1
 0
 1
 0
 0
 0
 1
 1
 1

In [87]:
kmer_index

9-element Vector{Int64}:
 2
 0
 0
 2
 5
 6
 6
 4
 1

In [88]:
proposed_path = BioSequences.BigDNAMer{k}[]

BioSequences.BigDNAMer{3}[]

In [89]:
# at each break in the path, take last good bits on each edge

In [90]:
# Position of the first observed k-mer that is absent from the graph
# (`nothing` when every observed k-mer is present)
initial_missing_kmer_path_index = findfirst(x -> x == 0, kmer_index)

2

In [91]:
# Position just before the first missing k-mer.
# NOTE(review): this subtraction throws when no k-mer is missing (the index is
# `nothing`) and yields 0 when the very first k-mer is missing.
opening_solid_kmer_path_index = initial_missing_kmer_path_index - 1

1

In [92]:
# Resolve the oriented k-mer at the last position before the first missing k-mer.
# `nothing` is tested by identity (`===`), the idiomatic and dispatch-free check.
if initial_missing_kmer_path_index === nothing
    @show "done"
elseif initial_missing_kmer_path_index == 1
    @show "missing opening"
    opening_solid_kmer = nothing
else
    opening_solid_kmer = graph.vprops[kmer_index[opening_solid_kmer_path_index]][:kmer]
    # The graph stores canonical k-mers; restore the orientation seen in the observation
    if !is_canonical[opening_solid_kmer_path_index]
        opening_solid_kmer = BioSequences.reverse_complement(opening_solid_kmer)
    end
    opening_solid_kmer
end

DNA 3-mer:
AGC

In [93]:
# First position after the opening solid k-mer whose k-mer is present in the graph
# (`nothing` when no such position exists)
closing_solid_kmer_path_index = findnext(
    index -> index != 0,
    kmer_index,
    opening_solid_kmer_path_index + 1,
)

4

In [94]:
# Resolve the oriented k-mer at the first solid position after the gap.
# `nothing` is tested by identity (`===`), the idiomatic and dispatch-free check.
if closing_solid_kmer_path_index === nothing
    @show "end is open"
    closing_solid_kmer = nothing
else
    closing_solid_kmer = graph.vprops[kmer_index[closing_solid_kmer_path_index]][:kmer]
    @show closing_solid_kmer
    # The graph stores canonical k-mers; restore the orientation seen in the observation
    if !is_canonical[closing_solid_kmer_path_index]
        closing_solid_kmer = BioSequences.reverse_complement(closing_solid_kmer)
    end
    closing_solid_kmer
end

closing_solid_kmer = AGC


DNA 3-mer:
GCT

In [95]:
# if end is open, need to take a random walk of N steps

In [96]:
# Gap at the start of the observation: walk backwards from the closing solid k-mer by
# walking forwards from its reverse complement, then flip the walk back around.
if (opening_solid_kmer === nothing) && (closing_solid_kmer !== nothing)
    @show "missing opening"
    inverted_closing_solid_kmer = BioSequences.reverse_complement(closing_solid_kmer)
    @show inverted_closing_solid_kmer
    # One k-mer per observation position up to and including the closing solid k-mer
    walk_length = closing_solid_kmer_path_index
    chosen_walk = take_a_walk(graph, edge_likelihoods, kmers, inverted_closing_solid_kmer, walk_length)
    chosen_walk = reverse(BioSequences.reverse_complement.(chosen_walk))
    @show chosen_walk
    @assert observation_kmers[closing_solid_kmer_path_index].fw == last(chosen_walk)
    proposed_path = vcat(chosen_walk, map(x -> x.fw, observation_kmers[closing_solid_kmer_path_index+1:end]))
end

In [97]:
# Gap runs to the end of the observation: walk forwards from the opening solid k-mer
# for as many positions as remain, and splice the walk onto the solid prefix.
if (opening_solid_kmer !== nothing) && (closing_solid_kmer === nothing)
    @show "missing closing"
    walk_length = length(observation_kmers) - opening_solid_kmer_path_index + 1
    chosen_walk = take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length)
    @show chosen_walk
    @assert observation_kmers[opening_solid_kmer_path_index].fw == first(chosen_walk)
    # The walk starts at the opening solid k-mer, which the prefix already contains
    proposed_path = vcat(map(x -> x.fw, observation_kmers[1:opening_solid_kmer_path_index]), chosen_walk[2:end])
end

In [113]:
# Gap enclosed by solid k-mers on both sides: walk from the opening solid k-mer,
# find where the end of the walk rejoins the observation, and append the remainder.
if (opening_solid_kmer !== nothing) && (closing_solid_kmer !== nothing)
    @show "bubble!"
    # overshoot a bit: walk ~10% further than the gap; ceil(Int, ...) keeps the
    # walk length an integer instead of a Float64
    walk_length = ceil(Int, (closing_solid_kmer_path_index - opening_solid_kmer_path_index + 1) * 1.1)
    opening_solid_kmer_index = kmer_index[opening_solid_kmer_path_index]
    closing_solid_kmer_index = kmer_index[closing_solid_kmer_path_index]
    @show chosen_walk = take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length)
    # Index (relative to the closing solid position) where the walk's last k-mer
    # first reappears in the observation
    @show last_overlap = findfirst(kmer -> kmer.fw == last(chosen_walk), observation_kmers[closing_solid_kmer_path_index:end])
    @assert last_overlap !== nothing
    @show remaining_kmers = getproperty.(observation_kmers[closing_solid_kmer_path_index + last_overlap:end], :fw)
    proposed_path = vcat(chosen_walk, remaining_kmers)
end

"bubble!" = "bubble!"
chosen_walk = take_a_walk(graph, edge_likelihoods, kmers, opening_solid_kmer, walk_length) = BioSequences.BigDNAMer{3}[AGC, GCC, CCT, CTG, TGC]
last_overlap = findfirst((kmer->begin
                #= In[113]:12 =#
                kmer.fw == last(chosen_walk)
            end), observation_kmers[closing_solid_kmer_path_index:end]) = 3
remaining_kmers = getproperty.(observation_kmers[closing_solid_kmer_path_index + last_overlap:end], :fw) = BioSequences.BigDNAMer{3}[GCA, CAA, AAA]


8-element Vector{BioSequences.BigDNAMer{3}}:
 AGC
 GCC
 CCT
 CTG
 TGC
 GCA
 CAA
 AAA

In [114]:
# Start the corrected sequence with the first k-mer of the proposed path
proposed_seq = BioSequences.LongDNASeq(first(proposed_path))

3nt DNA Sequence:
AGC

In [115]:
# Each further k-mer of the path overlaps the previous one by all but its final
# base, so only that final base is appended.
for path_kmer in Iterators.drop(proposed_path, 1)
    push!(proposed_seq, last(path_kmer))
end
proposed_seq

10nt DNA Sequence:
AGCCTGCAAA

In [116]:
# Levenshtein-distance alignment of the corrected sequence against the genome
new_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), proposed_seq, genome)

BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 0
  seq:  1 AGCCTGCAAA 10
          ||||||||||
  ref:  1 AGCCTGCAAA 10


In [117]:
# Same alignment for the reverse complement of the corrected sequence
reverse_new_alignment = BioAlignments.pairalign(BioAlignments.LevenshteinDistance(), BioSequences.reverse_complement(proposed_seq), genome)

BioAlignments.PairwiseAlignmentResult{Int64, BioSequences.LongDNASeq, BioSequences.LongDNASeq}:
  distance: 8
  seq:  0 --TTTGCAGGCT 10
              ||||    
  ref:  1 AGCCTGCA--AA 10


In [118]:
# Keep whichever strand of the corrected sequence aligns with the smaller distance;
# only the alignment is swapped here, the sequences themselves are left untouched
if reverse_new_alignment.value < new_alignment.value
#     observation = BioSequences.reverse_complement!(observation)
    new_alignment = reverse_new_alignment
    @show "flipping"
end

In [119]:
# The corrected sequence must be no further from the genome than the raw observation
@assert new_alignment.value <= alignment.value