In [None]:
# If plotting libraries fail to load, try resetting the LD path for Julia.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# An unset variable is treated like an empty one, so this check no longer throws a
# KeyError on machines where LD_LIBRARY_PATH is simply not defined.
@assert get(ENV, "LD_LIBRARY_PATH", "") == ""
import Pkg
# Packages used by this notebook; each name is listed once.
pkgs = [
    "Revise",
    "FASTX",
    "BioSequences",
    "Kmers",
    "Graphs",
    "MetaGraphs",
    "SparseArrays",
    "ProgressMeter",
    "Distributions",
    "HiddenMarkovModels",
    "BioAlignments",
    "StatsBase",
    "Random",
    "StatsPlots",
    "Statistics",
    # "GraphMakie",
    "IterTools",
    "Primes",
    "OnlineStats",
    "IteratorSampling",
    "HypothesisTests",
]
# Pkg.add(pkgs)
# `import` (not `using`) keeps every call below module-qualified.
# The package names are the literals above, so evaluating the parsed string is safe here.
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# Directory layout. The project root is taken to be the parent of the current
# working directory.
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = mkpath(joinpath(data_dir, "genomes"))
# Created up front: the figures produced below are saved into this directory, and
# writing a file into a directory that does not exist fails.
results_dir = mkpath(joinpath(PROJECT_BASEDIR, "results"))
working_dir = joinpath(data_dir, "test")
mkpath(working_dir)

In [None]:
# short_read_sets = unique(map(x -> match(r"^(.+\.\d+x)\.", x).captures[1], filter(x -> occursin(r"\.fna\.art", x) && occursin(r"\.fq\.gz", x) && !occursin("trimming_report", x) && !occursin("_val_", x), sort(readdir(genome_dir, join=true), by=x->filesize(x)))))
# # forward = short_read_set * ".1_val_1.fq.gz"
# # reverse = short_read_set * ".2_val_2.fq.gz"

In [None]:
# Every file in the genome directory whose name ends in ".filtlong.fq.gz",
# ordered from the smallest file to the largest; the smallest one is used below.
long_read_fastqs = sort(
    filter(Base.Fix1(occursin, r"\.filtlong\.fq\.gz$"), readdir(genome_dir, join=true)),
    by=filesize,
)
fastq = first(long_read_fastqs)

In [None]:
# Derive the reference FASTA path from the FASTQ path by deleting ".badread" and
# everything that follows it; the path is left unchanged if ".badread" is absent.
# NOTE(review): presumably the reads were simulated from this FASTA — confirm.
reference_fasta = replace(fastq, r"\.badread.*" => "")

In [None]:
# Choose k from the reads and fix the k-mer type used throughout the notebook.
# NOTE(review): the second type parameter is hard-coded to 1 — confirm this is valid for
# every k that `assess_dnamer_saturation` can return.
k = Mycelia.assess_dnamer_saturation([fastq])

kmer_type = Kmers.DNAKmer{k, 1}

# reference_kmers = Set(keys(fasta_to_reference_kmer_counts(kmer_type = kmer_type, fasta = reference_fasta)))
reference_kmer_counts = Mycelia.fasta_to_reference_kmer_counts(kmer_type=kmer_type, fasta=reference_fasta)

records = collect(Mycelia.open_fastx(fastq))

# Decode the quality scores once per read; every statistic and the plot reuse them.
read_quality_scores = [collect(FASTX.quality_scores(record)) for record in records]

# `Iterators.flatten` walks the per-read vectors lazily as one stream of scores,
# without splatting one argument per read into a varargs call.
fit_mean = OnlineStats.fit!(OnlineStats.Mean(), Iterators.flatten(read_quality_scores))
fit_mean_value = OnlineStats.value(fit_mean)

fit_extrema = OnlineStats.fit!(OnlineStats.Extrema(), Iterators.flatten(read_quality_scores))

fit_variance = OnlineStats.fit!(OnlineStats.Variance(), Iterators.flatten(read_quality_scores))
standard_deviation = sqrt(OnlineStats.value(fit_variance))

# Plot a random sample of 10^4 base quality scores rather than every base.
p = StatsPlots.scatter(
    IteratorSampling.itsample(Iterators.flatten(read_quality_scores), 10^4),
    title = "base quality scores",
    xlabel = "read index",
    ylabel = "quality score (PHRED)",
    # color = :black,
    alpha = 0.25,
    label = nothing)

StatsPlots.hline!(
    p,
    [fit_mean_value],
    labels = "mean = $(round(fit_mean_value, digits=1))",
    linestyle = :dash
)

# One standard deviation below the mean quality, drawn as a second reference line.
one_sigma_quality = fit_mean_value - standard_deviation
StatsPlots.hline!(p, [one_sigma_quality], label = "(mean - 1σ) = $(round(one_sigma_quality, digits=1))")
StatsPlots.savefig(p, joinpath(results_dir, "base_quality_scores.svg"))
p

In [None]:
# Sanity check: one standard deviation below the mean quality must still be positive.
@assert one_sigma_quality > 0
# Mean per-base quality scaled by the k-mer length.
k * OnlineStats.value(fit_mean)

In [None]:
# The k-mer length, then the (mean - 1σ) quality scaled by it.
k
k * one_sigma_quality

In [None]:
# Map every k-mer seen in the reads to a length-k vector holding, per position, the
# sum of the base quality scores over all of its occurrences.
all_kmer_quality_support = Dict{kmer_type, Vector{Float64}}()
for record in records
    base_qualities = collect(FASTX.quality_scores(record))
    window_starts = 1:length(base_qualities)-k+1
    # One length-k window of qualities per k-mer start position.
    quality_windows = (view(base_qualities, start:start+k-1) for start in window_starts)
    sequence = BioSequences.LongDNA{2}(FASTX.sequence(record))
    # The k-mer iterator and the windows are consumed in lockstep.
    for ((_, kmer), window) in zip(Kmers.EveryKmer{kmer_type}(sequence), quality_windows)
        support = get!(() -> zeros(Float64, k), all_kmer_quality_support, kmer)
        support .+= window
    end
end
all_kmer_quality_support
kmer_counts = Mycelia.count_kmers(kmer_type, fastq)
# Number the counted k-mers 1..n in the iteration order of `kmer_counts`.
kmer_indices = Dict(kmer => index for (index, kmer) in enumerate(keys(kmer_counts)))

# StatsPlots.histogram(
#     collect(values(kmer_counts)),
#     legend=false,
#     title = "kmer counts",
#     xlabel = "# of occurances",
#     ylabel = "# of kmers",
#     # yscale = :log2,
#     ylims = (0, maximum(collect(values(kmer_counts))))
#     # color = :black
# )

# Canonical k-mer counts for the same FASTQ, numbered 1..n in iteration order.
canonical_kmer_counts = Mycelia.count_canonical_kmers(kmer_type, fastq)
canonical_kmer_indices = Dict(
    kmer => index for (index, kmer) in enumerate(keys(canonical_kmer_counts))
)

# StatsPlots.histogram(
#     collect(values(canonical_kmer_counts)),
#     legend=false,
#     title = "canonical kmer counts",
#     xlabel = "# of occurances",
#     ylabel = "# of kmers",
#     yscale = :log2
#     # color = :black
# )

# Split the observed k-mer counts by whether the k-mer also occurs in the reference.
# NOTE(review): the title and file name say "canonical", but the counts come from
# `kmer_counts`, not `canonical_kmer_counts` — confirm which one is intended.
valid_kmer_counts = [n for (kmer, n) in kmer_counts if kmer ∈ keys(reference_kmer_counts)]
invalid_kmer_counts = [n for (kmer, n) in kmer_counts if kmer ∉ keys(reference_kmer_counts)]

# Overlay the density of both groups and save the figure.
p = StatsPlots.density(
    [invalid_kmer_counts, valid_kmer_counts];
    labels = ["sequencing artifacts" "valid kmer counts"],
    legend = :topright,
    title = "canonical kmer counts",
    xlabel = "# of occurances",
    ylabel = "# of kmers",
)
StatsPlots.savefig(p, joinpath(results_dir, "canonical_kmer_counts.svg"))
p

In [None]:
reference_kmers = sort(collect(keys(reference_kmer_counts)))

# One value per k-mer: the mean, over its k positions, of the summed quality scores.
# (Broadcasting `mean` over a vector of numbers returns the numbers unchanged, so the
# mean has to be taken over the whole vector to get a per-k-mer value.)
valid_average_qualities = Float64[]
invalid_average_qualities = Float64[]
for (kmer, quality_values) in all_kmer_quality_support
    average = Statistics.mean(quality_values)
    # Keyed lookup in the reference counts instead of scanning the sorted vector.
    if kmer in keys(reference_kmer_counts)
        push!(valid_average_qualities, average)
    else
        push!(invalid_average_qualities, average)
    end
end

valid_average_quality_mean = Statistics.mean(valid_average_qualities)
invalid_average_quality_mean = Statistics.mean(invalid_average_qualities)

# Jittered strip plot: valid k-mers around x = 1, sequencing artifacts around x = 2.
p = StatsPlots.scatter(
    [Mycelia.jitter(2, length(invalid_average_qualities)), Mycelia.jitter(1, length(valid_average_qualities))],
    [invalid_average_qualities, valid_average_qualities],
    alpha=0.2,
    title = "Mean of cumulative FASTQ quality scores\nstrand-specific",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Short horizontal bars marking each group's mean.
StatsPlots.plot!(p,
    [0.75, 1.25],
    [valid_average_quality_mean, valid_average_quality_mean],
    linewidth=4,
    color=:orange,
    label = "mean = $(round(valid_average_quality_mean, digits=3))")
StatsPlots.plot!(p,
    [1.75, 2.25],
    [invalid_average_quality_mean, invalid_average_quality_mean],
    linewidth=4,
    color=:blue,
    label="mean = $(round(invalid_average_quality_mean, digits=3))")

StatsPlots.savefig(p, joinpath(results_dir, "mean-kmer-qual-scores-stranded.svg"))
p

In [None]:
# Combine each k-mer's quality support with that of its reverse complement.
strand_normalized_quality_support = Dict{kmer_type, Vector{Float64}}()
for (kmer, support) in all_kmer_quality_support
    # Work on a copy: adding in place to `support` itself would modify
    # `all_kmer_quality_support` while it is being read, so the partner k-mer
    # would later pick up an already-merged vector and be counted twice.
    combined_support = copy(support)
    reverse_complement_support =
        get(all_kmer_quality_support, BioSequences.reverse_complement(kmer), nothing)
    if reverse_complement_support !== nothing
        # NOTE(review): position j of the reverse complement corresponds to position
        # k - j + 1 of `kmer`; the vectors are added without reversing, which leaves
        # sums and means unchanged but not the per-position values — confirm intent.
        combined_support .+= reverse_complement_support
    end
    strand_normalized_quality_support[kmer] = combined_support
end

# One value per k-mer: the mean, over its k positions, of the combined quality sums.
# (Broadcasting `mean` over a vector of numbers returns the numbers unchanged, so the
# mean has to be taken over the whole vector to get a per-k-mer value.)
valid_average_qualities = Float64[]
invalid_average_qualities = Float64[]
for (kmer, quality_values) in strand_normalized_quality_support
    average = Statistics.mean(quality_values)
    # Keyed lookup in the reference counts instead of scanning a sorted vector.
    if kmer in keys(reference_kmer_counts)
        push!(valid_average_qualities, average)
    else
        push!(invalid_average_qualities, average)
    end
end

valid_average_quality_mean = Statistics.mean(valid_average_qualities)
invalid_average_quality_mean = Statistics.mean(invalid_average_qualities)

# Jittered strip plot: valid k-mers around x = 1, sequencing artifacts around x = 2.
p = StatsPlots.scatter(
    [Mycelia.jitter(2, length(invalid_average_qualities)), Mycelia.jitter(1, length(valid_average_qualities))],
    [invalid_average_qualities, valid_average_qualities],
    alpha=0.2,
    title = "Mean of cumulative FASTQ quality scores\nreverse-complement-aware",
    # title = "Average adjusted joint-Q value for each Kmer",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Short horizontal bars marking each group's mean.
StatsPlots.plot!(p,
    [0.75, 1.25],
    [valid_average_quality_mean, valid_average_quality_mean],
    linewidth=4,
    color=:orange,
    label = "mean = $(round(valid_average_quality_mean, digits=3))")
StatsPlots.plot!(p,
    [1.75, 2.25],
    [invalid_average_quality_mean, invalid_average_quality_mean],
    linewidth=4,
    color=:blue,
    label="mean = $(round(invalid_average_quality_mean, digits=3))")

StatsPlots.savefig(p, joinpath(results_dir, "mean-kmer-qual-scores-shared.svg"))
p

In [None]:
# One value per k-mer: the total of its combined per-position quality sums.
valid_total_qualities = Float64[]
invalid_total_qualities = Float64[]
for (kmer, quality_values) in strand_normalized_quality_support
    total_quality = sum(quality_values)
    # Keyed lookup in the reference counts instead of scanning a sorted vector.
    if kmer in keys(reference_kmer_counts)
        push!(valid_total_qualities, total_quality)
    else
        push!(invalid_total_qualities, total_quality)
    end
end

valid_total_quality_mean = Statistics.mean(valid_total_qualities)
invalid_total_quality_mean = Statistics.mean(invalid_total_qualities)

# Jittered strip plot: valid k-mers around x = 1, sequencing artifacts around x = 2.
p = StatsPlots.scatter(
    [Mycelia.jitter(2, length(invalid_total_qualities)), Mycelia.jitter(1, length(valid_total_qualities))],
    [invalid_total_qualities, valid_total_qualities],
    alpha=0.2,
    title = "Total of cumulative FASTQ quality scores\nreverse-complement-aware",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Short horizontal bars marking each group's mean.
StatsPlots.plot!(p,
    [0.75, 1.25],
    [valid_total_quality_mean, valid_total_quality_mean],
    linewidth=4,
    color=:orange,
    label = "mean = $(round(valid_total_quality_mean, digits=3))")
StatsPlots.plot!(p,
    [1.75, 2.25],
    [invalid_total_quality_mean, invalid_total_quality_mean],
    linewidth=4,
    color=:blue,
    label="mean = $(round(invalid_total_quality_mean, digits=3))")

StatsPlots.savefig(p, joinpath(results_dir, "total-kmer-qual-scores-shared.svg"))
p

In [None]:
# Total quality per k-mer, and each k-mer's share of the overall total.
kmer_total_quality = Dict(kmer => sum(quality_values) for (kmer, quality_values) in strand_normalized_quality_support)
# state_likelihoods = Dict(kmer => kmer_count / total_kmers for (kmer, kmer_count) in kmer_counts)
# The grand total is the same for every k-mer, so compute it once instead of once per entry.
overall_total_quality = sum(values(kmer_total_quality); init=0.0)
state_likelihoods = Dict(kmer => total_quality / overall_total_quality for (kmer, total_quality) in kmer_total_quality)

total_states = length(state_likelihoods)

# Record every observed k-mer -> next k-mer transition as a (row, column) index pair.
# Collecting the pairs first and building the matrix in one go avoids inserting into
# the compressed sparse matrix one entry at a time.
transition_sources = Int[]
transition_destinations = Int[]
for record in records
    sequence = BioSequences.LongDNA{4}(FASTX.sequence(record))
    # Two k-mer streams offset by one base, walked in lockstep.
    sources = Kmers.EveryKmer{kmer_type}(sequence[1:end-1])
    destinations = Kmers.EveryKmer{kmer_type}(sequence[2:end])
    for ((source_i, source), (destination_i, destination)) in zip(sources, destinations)
        push!(transition_sources, kmer_indices[source])
        push!(transition_destinations, kmer_indices[destination])
    end
end
# `sparse` adds up the values of repeated (row, column) pairs, which yields the counts.
transition_counts = SparseArrays.sparse(
    transition_sources,
    transition_destinations,
    ones(Float64, length(transition_sources)),
    total_states,
    total_states,
)
# Normalise each row by its total so the outgoing likelihoods of a state sum to 1.
# Only stored entries are touched, so rows without transitions stay empty and a
# stored entry always has a positive row total.
outgoing_totals = vec(sum(transition_counts, dims=2))
count_rows, count_columns, count_values = SparseArrays.findnz(transition_counts)
transition_likelihoods = SparseArrays.sparse(
    count_rows,
    count_columns,
    count_values ./ outgoing_totals[count_rows],
    total_states,
    total_states,
)
transition_likelihoods

# Directed graph with one vertex per state and one edge per stored transition.
g = Graphs.SimpleDiGraph(total_states)
row_indices, column_indices, cell_values = SparseArrays.findnz(transition_likelihoods)
foreach(row_indices, column_indices) do row, col
    Graphs.add_edge!(g, row, col)
end
g

# Vertices with at most one incoming and at most one outgoing edge.
unbranching_nodes = Set{Int}(
    node for node in Graphs.vertices(g)
    if Graphs.indegree(g, node) <= 1 && Graphs.outdegree(g, node) <= 1
)
# Separate copy that the path walk consumes.
unvisited_unbranching_nodes = Set(unbranching_nodes)

# All remaining vertices branch on at least one side.
branching_nodes = setdiff(Graphs.vertices(g), unbranching_nodes)
for branching_node in branching_nodes
    @assert Graphs.degree(g, branching_node) >= 2
end
inbranching_nodes = filter(branching_nodes) do node
    Graphs.indegree(g, node) > 1
end
outbranching_nodes = filter(branching_nodes) do node
    Graphs.outdegree(g, node) > 1
end

# Grow a path from each not-yet-visited unbranching node: first forwards along unique
# successors, then backwards along unique predecessors. A branching neighbour is added
# as the path's end point and stops the walk on that side.
unbranching_paths = Vector{Int}[]
while !isempty(unvisited_unbranching_nodes)
    current_path = [rand(unvisited_unbranching_nodes)]
    delete!(unvisited_unbranching_nodes, first(current_path))
    # Set when the forward walk arrives back at its own starting node.
    closed_isolated_cycle = false

    outneighbors = Graphs.outneighbors(g, last(current_path))

    while length(outneighbors) == 1
        outneighbor = first(outneighbors)
        outneighbors_inneighbors = Graphs.inneighbors(g, outneighbor)
        if outneighbors_inneighbors == [last(current_path)]
            if outneighbor == first(current_path)
                # Every node on the path has exactly one predecessor and one successor
                # and the walk is back at its start: an isolated cycle. Stop here,
                # otherwise the walk would go round the cycle forever.
                closed_isolated_cycle = true
                break
            end
            push!(current_path, outneighbor)
            delete!(unvisited_unbranching_nodes, outneighbor)
            outneighbors = Graphs.outneighbors(g, outneighbor)
        else
            @assert length(outneighbors_inneighbors) > 1
            push!(current_path, outneighbor)
            delete!(unvisited_unbranching_nodes, outneighbor)
            break
        end
    end
    inneighbors = Graphs.inneighbors(g, first(current_path))
    # A closed cycle is already complete; walking backwards would only repeat it.
    while !closed_isolated_cycle && length(inneighbors) == 1
        inneighbor = first(inneighbors)
        inneighbors_outneighbors = Graphs.outneighbors(g, inneighbor)
        if inneighbors_outneighbors == [first(current_path)]
            pushfirst!(current_path, inneighbor)
            delete!(unvisited_unbranching_nodes, inneighbor)
            inneighbors = Graphs.inneighbors(g, inneighbor)
        else
            @assert length(inneighbors_outneighbors) > 1
            pushfirst!(current_path, inneighbor)
            delete!(unvisited_unbranching_nodes, inneighbor)
            break
        end
    end
    push!(unbranching_paths, current_path)
end
unbranching_paths

# Score each path by the mean total quality of the states on it.
unbranching_path_scores = Float64[]
# Graph vertex i is the i-th key of `kmer_counts` (that is how `kmer_indices` numbers
# the states), so the scores are laid out in exactly that order rather than relying on
# a sorted copy of the dictionary happening to iterate the same way.
state_scores = [kmer_total_quality[kmer] for kmer in keys(kmer_counts)]
for unbranching_path in unbranching_paths
    push!(unbranching_path_scores, Statistics.mean(state_scores[state] for state in unbranching_path))
end
StatsPlots.histogram(unbranching_path_scores)

# States (indices into `ordered_kmers`) whose k-mer also occurs in the reference.
ordered_kmers = collect(keys(kmer_counts))
# Keyed lookup in the reference counts instead of scanning a sorted vector per k-mer.
solid_states = findall(kmer -> kmer in keys(reference_kmer_counts), ordered_kmers)
# Set copy so the per-state membership test below does not scan a vector.
solid_state_set = Set(solid_states)

# Fraction of each path's states that are solid.
unbranching_path_solidity = [count(in(solid_state_set), unbranching_path)/length(unbranching_path) for unbranching_path in unbranching_paths]

# A path is solid only when every one of its states is solid.
is_solid_unbranching_path = unbranching_path_solidity .== 1.0
solid_unbranching_paths = findall(is_solid_unbranching_path)

# Split the path scores with the boolean mask; the order within each group is kept.
valid_unbranching_path_scores = Float64[]
invalid_unbranching_path_scores = Float64[]
append!(valid_unbranching_path_scores, unbranching_path_scores[is_solid_unbranching_path])
append!(invalid_unbranching_path_scores, unbranching_path_scores[.!is_solid_unbranching_path])

# Group means, computed once and reused for both the bars and their labels.
valid_path_score_mean = Statistics.mean(valid_unbranching_path_scores)
invalid_path_score_mean = Statistics.mean(invalid_unbranching_path_scores)

# Jittered strip plot: valid paths around x = 1, paths with artifacts around x = 2.
jittered_positions = [
    Mycelia.jitter(2, length(invalid_unbranching_path_scores)),
    Mycelia.jitter(1, length(valid_unbranching_path_scores)),
]
p = StatsPlots.scatter(
    jittered_positions,
    [invalid_unbranching_path_scores, valid_unbranching_path_scores];
    alpha = 0.2,
    title = "Mean of cumulative FASTQ quality scores\nreverse-complement-aware",
    xticks = ((1, 2), ("valid paths", "paths containing sequencing artifacts")),
    labels = nothing,
)

# Short horizontal bars marking each group's mean.
StatsPlots.plot!(
    p,
    [0.75, 1.25],
    fill(valid_path_score_mean, 2);
    linewidth = 4,
    color = :orange,
    label = "mean = $(round(valid_path_score_mean, digits=3))",
)
StatsPlots.plot!(
    p,
    [1.75, 2.25],
    fill(invalid_path_score_mean, 2);
    linewidth = 4,
    color = :blue,
    label = "mean = $(round(invalid_path_score_mean, digits=3))",
)

StatsPlots.savefig(p, joinpath(results_dir, "untig-kmer-qual-scores-shared.svg"))
p

In [None]:
# Total quality support per k-mer, and its sample mean and standard deviation.
total_strand_normalized_quality_support = sum.(collect(values(strand_normalized_quality_support)))
# minimum_average = min(Statistics.mean(total_strand_normalized_quality_support), Statistics.median(total_strand_normalized_quality_support))
mean_total_support = Statistics.mean(total_strand_normalized_quality_support)
std_total_support = Statistics.std(total_strand_normalized_quality_support)
# Compare against a normal distribution with the sample's own mean and standard
# deviation. The default `Normal()` is N(0, 1); raw quality totals are nowhere near
# that scale, so testing against it would reject regardless of the data's shape.
# NOTE(review): the parameters are estimated from the same sample, so the p-value is
# only approximate — confirm this is acceptable for the decision below.
test_is_single_distribution = HypothesisTests.ExactOneSampleKSTest(
    total_strand_normalized_quality_support,
    Distributions.Normal(mean_total_support, std_total_support),
)
single_distribution_pvalue = HypothesisTests.pvalue(test_is_single_distribution)
# Log the message itself; `@show` on a string literal prints the literal twice.
if single_distribution_pvalue < 1e-3
    @info "p = $(single_distribution_pvalue) rejecting error-free hypothesis & entering error correction"
else
    @info "single distribution detected, this data may be error-free"
end