In [1]:
# Identifiers for this experiment; they also name the output directory and files.
DATE = "2021-06-26"
TASK = "1000bp-100x-coverage-0.01-error-rate"
# Create (if needed) the per-experiment directory under the home directory.
# `mkpath` returns the path it created, so DIR holds the full directory path.
DIR = mkpath("$(homedir())/$(DATE)-$(TASK)")

"/Users/cameronprybol/2021-06-26-1000bp-100x-coverage-0.01-error-rate"

In [2]:
# Packages this notebook depends on; each one is installed and then imported by name.
pkgs = [
    "LightGraphs", "MetaGraphs", "BioSequences", "Random",
    "ProgressMeter", "Revise", "FASTX",
]

import Pkg
Pkg.add(pkgs)
# `import` (rather than `using`) keeps every later call module-qualified.
foreach(pkgs) do pkg
    eval(Meta.parse("import $(basename(pkg))"))
end

import Mycelia

In [3]:
# set a random seed so the run is reproducible; the value returned by
# `Random.seed!` is kept so it can be passed as an explicit RNG later
seed = Random.seed!(0)

MersenneTwister(0)

In [4]:
# randomly generate a DNA sequence of 1000bp, drawing from the seeded RNG
genome = BioSequences.randdnaseq(seed, 1000)

1000nt DNA Sequence:
AGCATTGCGTAGTGAAGTTTACGGACATTCAACCGATCG…CGCCCGCACCAACCGAACAAGCCCCGATGTCACCTTAGT

In [5]:
# define the error rate passed to `Mycelia.observe` when simulating noisy reads;
# the same value is converted to a Phred quality score for the FASTQ records
error_rate = 0.01

0.01

In [6]:
# Generate `coverage` observations of the genome and write them to two FASTQ files:
#   - an error-free file: reads observed with error_rate = 0.0, every base at quality 60
#   - a noisy file: reads observed with `error_rate`, every base at the Phred quality
#     implied by that error rate
# Record `i` has the same identifier ("$i") in both files.
coverage = 100
fastq_file = "$(DIR)/$(DATE)-$(TASK).fastq"
error_free_fastq_file = "$(DIR)/$(DATE)-$(TASK).error-free.fastq"

# Phred quality Q = -10 * log10(p). It does not depend on the read, so compute it once.
# Round to an integer: FASTQ stores one quality character per base, so a fractional
# score (e.g. from error_rate = 0.05) cannot be encoded.
error_free_quality = 60
observed_quality = round(Int, -10 * log10(error_rate))

fastq_io = FASTX.FASTQ.Writer(open(fastq_file, "w"))
error_free_fastq_io = FASTX.FASTQ.Writer(open(error_free_fastq_file, "w"))
try
    for i in 1:coverage
        # The error-free observation is drawn first, then the noisy one, so the
        # order of calls to `Mycelia.observe` is the same in every iteration.
        true_sequence = Mycelia.observe(genome, error_rate=0.0)
        true_quality_scores = fill(error_free_quality, length(true_sequence))
        error_free_fastq_record = FASTX.FASTQ.Record("$i", true_sequence, true_quality_scores)
        write(error_free_fastq_io, error_free_fastq_record)

        observed_sequence = Mycelia.observe(genome, error_rate=error_rate)
        observed_quality_scores = fill(observed_quality, length(observed_sequence))
        fastq_record = FASTX.FASTQ.Record("$i", observed_sequence, observed_quality_scores)
        write(fastq_io, fastq_record)
    end
finally
    # Always flush and close both writers, even if simulation or writing fails,
    # so no file handle is leaked and no partially buffered output is lost.
    close(fastq_io)
    close(error_free_fastq_io)
end

In [7]:
# k-mer length; parameterizes the k-mer type and is embedded in the GFA file names
k = 13

13

In [8]:
# k-mer type of length k, passed to `Mycelia.fastx_to_simple_kmer_graph`
kmer_type = BioSequences.BigDNAMer{k}

BioSequences.BigDNAMer{13} (alias for BioSequences.BigMer{BioSequences.DNAAlphabet{2}, 13})

In [9]:
"""
    visualize_gfa_file(gfa_file; bandage="/Applications/Bandage.app/Contents/MacOS/Bandage")

Render `gfa_file` to an SVG image with Bandage and display that image as inline HTML.

The image is written next to the input, at the path `gfa_file` followed by `.svg`.
The HTML `<img>` tag points at the image with a path relative to the current working
directory.

# Arguments
- `gfa_file`: path to the GFA file to render.

# Keywords
- `bandage`: path to the Bandage executable. Defaults to the macOS application bundle
  location.

# Returns
Whatever `display` returns for the HTML snippet.

# Throws
An error from `run` if the Bandage process cannot be started or exits with a failure.
"""
function visualize_gfa_file(
    gfa_file;
    bandage="/Applications/Bandage.app/Contents/MacOS/Bandage",
)
    svg_file = "$(gfa_file).svg"
    # Bandage options relevant here:
    # --nodewidth <float> Average node width (0.5 to 1000, default: 5)
    # --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
    # --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)
    run(`$bandage image $gfa_file $svg_file --depwidth 1 --deppower 1`)
    # Compute the path relative to the working directory directly, instead of counting
    # path components and assuming the file lives under a home directory of fixed depth.
    svg_relpath = relpath(svg_file, pwd())
    # Quote the attribute so paths containing spaces still form valid HTML.
    return display("text/html", "<img src=\"$(svg_relpath)\">")
end

visualize_gfa_file (generic function with 1 method)

In [10]:
# Build the k-mer graph from the error-free reads.
error_free_simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(
    kmer_type,
    error_free_fastq_file,
)
# Export the graph as GFA next to the FASTQ file, then render and display it.
gfa_file = "$(error_free_fastq_file).k-$(k).gfa"
Mycelia.graph_to_gfa(error_free_simple_kmer_graph, gfa_file)
visualize_gfa_file(gfa_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2958


In [11]:
# Build the k-mer graph from the reads simulated with errors.
simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(
    kmer_type,
    fastq_file,
)
# Export the graph as GFA next to the FASTQ file, then render and display it.
gfa_file = "$(fastq_file).k-$(k).gfa"
Mycelia.graph_to_gfa(simple_kmer_graph, gfa_file)
visualize_gfa_file(gfa_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2958


In [13]:
# Run `Mycelia.simple_polish_fastq` on the noisy reads, using the k-mer graph built from
# those same reads and `min_depth=7`; the result is kept as the polished FASTQ file path.
# `@time` reports elapsed time and allocations for the call.
@time polished_fastq_file = Mycelia.simple_polish_fastq(simple_kmer_graph, fastq_file, min_depth=7)

 22.524870 seconds (57.39 M allocations: 17.002 GiB, 17.43% gc time)


"/Users/cameronprybol/2021-06-26-1000bp-100x-coverage-0.01-error-rate/2021-06-26-1000bp-100x-coverage-0.01-error-rate.k13.d7.fastq"

In [14]:
# Build the k-mer graph from the polished reads.
polished_simple_kmer_graph = Mycelia.fastx_to_simple_kmer_graph(
    kmer_type,
    polished_fastq_file,
)
# Export the graph as GFA next to the FASTQ file, then render and display it.
gfa_file = "$(polished_fastq_file).k-$(k).gfa"
Mycelia.graph_to_gfa(polished_simple_kmer_graph, gfa_file)
visualize_gfa_file(gfa_file)

┌ Info: creating graph
└ @ Mycelia /Users/cameronprybol/.julia/dev/Mycelia/src/Mycelia.jl:2958


In [15]:
# Distance between the error-free graph and the graph built from the unpolished noisy reads
Mycelia.kmer_graph_distances(error_free_simple_kmer_graph, simple_kmer_graph)

(kmer_distance = 0.9096065873741994, edge_distance = 0.9157418473621308)

In [16]:
# Distance between the error-free graph and the graph built from the polished reads
Mycelia.kmer_graph_distances(error_free_simple_kmer_graph, polished_simple_kmer_graph)

(kmer_distance = 0.0, edge_distance = 0.0)