In [1]:
# Date stamp and task label that together name this analysis.
DATE = "2021-06-26"
TASK = "assess-reconstruction-accuracy"
# Working directory "<home>/<DATE>-<TASK>". `mkpath` creates it and returns the
# path, which is the value DIR ends up holding.
DIR = mkpath("$(homedir())/$(DATE)-$(TASK)")

"/Users/cameronprybol/2021-06-26-assess-reconstruction-accuracy"

In [57]:
# Registered packages this notebook relies on.
pkgs = [
    "Graphs",
    "MetaGraphs",
    "BioSequences",
    "uCSV",
    "DataFrames",
    "FASTX",
    "Random",
    "ProgressMeter",
    "Revise",
    "StatsBase",
    "BioAlignments",
    "Statistics",
    "Distances",
    "LSHFunctions",
]

import Pkg

# Add the packages to the active environment, then `import` each one by name so
# that its functions are always called with a qualified prefix.
Pkg.add(pkgs)
for pkg in pkgs
    @eval import $(Symbol(basename(pkg)))
end

# Mycelia is imported directly rather than through the list above.
import Mycelia

┌ Info: Precompiling LSHFunctions [5134c85a-a9db-11e9-340f-8514dff59a31]
└ @ Base loading.jl:1317
[33m[1m│ [22m[39mThis may mean Distributions [31c24e10-a181-5473-b8eb-7969acd0382f] does not support precompilation but is imported by a module that does.
[33m[1m└ [22m[39m[90m@ Base loading.jl:1008[39m
┌ Info: Skipping precompilation since __precompile__(false). Importing LSHFunctions [5134c85a-a9db-11e9-340f-8514dff59a31].
└ @ Base loading.jl:1025


In [72]:
"""
    graph_to_kmers(g)

Return the `:kmer` property of every vertex of `g`, in the order given by
`Graphs.vertices(g)`.
"""
function graph_to_kmers(g)
    vertex_kmer(v) = g.vprops[v][:kmer]
    return map(vertex_kmer, Graphs.vertices(g))
end

graph_to_kmers (generic function with 1 method)

In [75]:
"""
    graph_to_edge_sequences(g)

Collect the canonical (k+1)-mer spelled by every oriented edge of `g`.

For each edge, and for each entry of that edge's `:orientations` property, the
source and destination k-mers are reverse-complemented when the matching
orientation flag is false. The two oriented k-mers are asserted to overlap by
k-1 bases and are joined into a (k+1)-mer: the source k-mer followed by the
last base of the destination k-mer. The canonical form of each (k+1)-mer is
stored in the returned `Set`.
"""
function graph_to_edge_sequences(g)
    k = g.gprops[:k]
    edges = Set{BioSequences.BigDNAMer{k + 1}}()
    for edge in Graphs.edges(g)
        src_kmer = g.vprops[edge.src][:kmer]
        dst_kmer = g.vprops[edge.dst][:kmer]
        for orientation in g.eprops[edge][:orientations]
            oriented_src_kmer = orientation.source_orientation ? src_kmer :
                BioSequences.reverse_complement(src_kmer)
            oriented_dst_kmer = orientation.destination_orientation ? dst_kmer :
                BioSequences.reverse_complement(dst_kmer)
            # The suffix of the source must equal the prefix of the destination.
            for i in 1:(k - 1)
                @assert oriented_src_kmer[i+1] == oriented_dst_kmer[i]
            end
            edge_mer = BioSequences.BigDNAMer(oriented_src_kmer..., last(oriented_dst_kmer))
            push!(edges, BioSequences.canonical(edge_mer))
        end
    end
    return edges
end

graph_to_edge_sequences (generic function with 1 method)

In [73]:
"""
    kmer_graph_distances(g1, g2)

Compare two k-mer graphs by their vertex k-mers and by their edge sequences.

Returns a named tuple `(kmer_distance, edge_distance)`. Each entry is
`1 - LSHFunctions.jaccard(a, b)` over the corresponding sets taken from `g1`
and `g2`.
"""
function kmer_graph_distances(g1, g2)
    first_kmers, first_edges = Set(graph_to_kmers(g1)), graph_to_edge_sequences(g1)
    second_kmers, second_edges = Set(graph_to_kmers(g2)), graph_to_edge_sequences(g2)
    return (
        kmer_distance = 1 - LSHFunctions.jaccard(first_kmers, second_kmers),
        edge_distance = 1 - LSHFunctions.jaccard(first_edges, second_edges),
    )
end

kmer_graph_distances (generic function with 1 method)

In [3]:
# Seed the default RNG with 0. `Random.seed!` returns the RNG object, so `seed`
# holds that RNG (not the integer 0) and is passed as the generator further down.
seed = Random.seed!(0)

MersenneTwister(0)

In [4]:
# Randomly generate a 100 bp DNA sequence, drawing from the seeded RNG.
genome = BioSequences.randdnaseq(seed, 100)

100nt DNA Sequence:
AAGGGTGCGGTCTAGGTGCACTGCTTATGGTCCCCGACA…TCTACCTCTTTGTTGAAACGTATTCTTCTCTTAACACCT

In [29]:
# Error rate handed to `Mycelia.observe` when simulating noisy reads; it is also
# converted into the quality score written to the FASTQ file.
error_rate = 0.01

0.01

In [30]:
# Simulate `coverage` reads of the genome and write them to two FASTQ files:
# one observed at `error_rate`, and one error-free set used as the reference.
coverage = 10
fastq_file = "$(DIR)/$(DATE)-$(TASK).fastq"
error_free_fastq_file = "$(DIR)/$(DATE)-$(TASK).error-free.fastq"

# Phred quality implied by the error rate (Q = -10 * log10(p)). It is identical
# for every read, so it is computed once. Rounding keeps it an integer score
# even when the error rate is not an exact power of ten.
q = round(Int, -10 * log10(error_rate))

fastq_io = FASTX.FASTQ.Writer(open(fastq_file, "w"))
error_free_fastq_io = FASTX.FASTQ.Writer(open(error_free_fastq_file, "w"))
try
    for i in 1:coverage
        # The read index is the record identifier, so every record is unique.
        true_sequence = Mycelia.observe(genome, error_rate=0.0)
        quality_scores = fill(60, length(true_sequence))
        error_free_fastq_record = FASTX.FASTQ.Record("$(i)", true_sequence, quality_scores)
        write(error_free_fastq_io, error_free_fastq_record)

        observed_sequence = Mycelia.observe(genome, error_rate=error_rate)
        quality_scores = fill(q, length(observed_sequence))
        fastq_record = FASTX.FASTQ.Record("$(i)", observed_sequence, quality_scores)
        write(fastq_io, fastq_record)
    end
finally
    # Close both writers even if simulating or writing a read fails.
    close(fastq_io)
    close(error_free_fastq_io)
end

In [31]:
# k-mer length used for the graphs built in this notebook.
k = 11

11

In [32]:
# k-mer type parameterized by the chosen k; passed to the graph constructors below.
kmer_type = BioSequences.BigDNAMer{k}

BioSequences.BigDNAMer{11} (alias for BioSequences.BigMer{BioSequences.DNAAlphabet{2}, 11})

In [33]:
# Build the k-mer graph from the simulated reads that contain errors.
simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(kmer_type, fastq_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2957


{182, 376} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [34]:
# Export the raw-read graph as GFA so it can be rendered.
gfa_file = "$(fastq_file).k-$(k).gfa"
Mycelia.graph_to_gfa(simple_kmer_graph, gfa_file)

"/Users/cameronprybol/2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.fastq.k-11.gfa"

In [35]:
# Render the raw-read GFA to "<gfa_file>.svg" with the Bandage command-line tool.
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(gfa_file) $(gfa_file).svg --depwidth 1 --deppower 1`)
# Bandage options relevant to the call above:
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

Process(`[4m/Applications/Bandage.app/Contents/MacOS/Bandage[24m [4mimage[24m [4m/Users/cameronprybol/2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.fastq.k-11.gfa[24m [4m/Users/cameronprybol/2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.fastq.k-11.gfa.svg[24m [4m--depwidth[24m [4m1[24m [4m--deppower[24m [4m1[24m`, ProcessExited(0))

In [36]:
# Path to the rendered SVG relative to the current working directory, for use as
# an <img> source. `relpath` derives it from the actual locations instead of
# assuming the home directory sits exactly two levels below the filesystem root.
html_path_to_svg = relpath("$(gfa_file).svg", pwd())

"./../../../../2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.fastq.k-11.gfa.svg"

In [37]:
# Embed the SVG in the notebook. The src attribute is quoted so that a path
# containing spaces or other special characters still forms valid HTML.
x = display("text/html", "<img src=\"$(html_path_to_svg)\">")

In [38]:
# Polish the noisy reads using the graph built from them. The call returns the
# path of the polished FASTQ file, and @time reports the elapsed time and allocations.
@time polished_fastq_file = Mycelia.simple_polish_fastq(simple_kmer_graph, fastq_file)

  0.018819 seconds (66.59 k allocations: 6.934 MiB)


"/Users/cameronprybol/2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.k11.d3.fastq"

In [39]:
# Rebuild the k-mer graph from the polished reads.
polished_simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(kmer_type, polished_fastq_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2957


{90, 178} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [40]:
# Export the polished-read graph as GFA so it can be rendered.
polished_gfa_file = "$(polished_fastq_file).k-$(k).gfa"
Mycelia.graph_to_gfa(polished_simple_kmer_graph, polished_gfa_file)

"/Users/cameronprybol/2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.k11.d3.fastq.k-11.gfa"

In [41]:
# Render the polished-read GFA to "<polished_gfa_file>.svg" with the Bandage command-line tool.
run(`/Applications/Bandage.app/Contents/MacOS/Bandage image $(polished_gfa_file) $(polished_gfa_file).svg --depwidth 1 --deppower 1`)
# Bandage options relevant to the call above:
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

Process(`[4m/Applications/Bandage.app/Contents/MacOS/Bandage[24m [4mimage[24m [4m/Users/cameronprybol/2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.k11.d3.fastq.k-11.gfa[24m [4m/Users/cameronprybol/2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.k11.d3.fastq.k-11.gfa.svg[24m [4m--depwidth[24m [4m1[24m [4m--deppower[24m [4m1[24m`, ProcessExited(0))

In [42]:
# Path to the polished graph's SVG relative to the current working directory,
# for use as an <img> source. `relpath` derives it from the actual locations
# instead of assuming the home directory sits exactly two levels below the
# filesystem root.
html_path_to_svg = relpath("$(polished_gfa_file).svg", pwd())

"./../../../../2021-06-26-assess-reconstruction-accuracy/2021-06-26-assess-reconstruction-accuracy.k11.d3.fastq.k-11.gfa.svg"

In [43]:
# Embed the SVG in the notebook. The src attribute is quoted so that a path
# containing spaces or other special characters still forms valid HTML.
x = display("text/html", "<img src=\"$(html_path_to_svg)\">")

In [44]:
# Build the reference k-mer graph from the error-free reads.
error_free_simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(kmer_type, error_free_fastq_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2957


{90, 178} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [76]:
# Distance between the error-free graph and the graph built from the raw reads.
kmer_graph_distances(error_free_simple_kmer_graph, simple_kmer_graph)

(kmer_distance = 0.5054945054945055, edge_distance = 0.5265957446808511)

In [77]:
# Distance between the error-free graph and the graph built from the polished reads.
kmer_graph_distances(error_free_simple_kmer_graph, polished_simple_kmer_graph)

(kmer_distance = 0.0, edge_distance = 0.0)