In [None]:
# If the plotting libraries fail to load, try clearing LD_LIBRARY_PATH for the Julia
# kernel (it can be set in the kernel spec under ~/.local/share/jupyter/kernels/).
# An unset variable is treated the same as an empty one, so the check does not raise a
# KeyError on machines that never define LD_LIBRARY_PATH.
@assert get(ENV, "LD_LIBRARY_PATH", "") == ""
import Pkg
pkgs = [
    "Revise",
    "FASTX",
    "BioSequences",
    "Kmers",
    "Graphs",
    "MetaGraphs",
    "SparseArrays",
    "ProgressMeter",
    "Distributions",
    "HiddenMarkovModels",
    "BioAlignments",
    "StatsBase",
    "Random",
    "StatsPlots",
    "Statistics",
    # "GraphMakie",
    "IterTools",
    "Primes",
    "OnlineStats",
    "IteratorSampling"
]
# Pkg.add(pkgs)
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# The project base directory is the parent of the current working directory.
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
# Genome files live in data/genomes; create the directory (and parents) if missing.
genome_dir = joinpath(data_dir, "genomes")
mkpath(genome_dir)

In [None]:
# Directory for this notebook's outputs, created on demand under data_dir.
working_dir = mkpath(joinpath(data_dir, "test"))

In [None]:
# short_read_sets = unique(map(x -> match(r"^(.+\.\d+x)\.", x).captures[1], filter(x -> occursin(r"\.fna\.art", x) && occursin(r"\.fq\.gz", x) && !occursin("trimming_report", x) && !occursin("_val_", x), sort(readdir(genome_dir, join=true), by=x->filesize(x)))))
# # forward = short_read_set * ".1_val_1.fq.gz"
# # reverse = short_read_set * ".2_val_2.fq.gz"

In [None]:
# Gather the files in genome_dir whose names end in ".filtlong.fq.gz", order them from
# smallest to largest on disk, and work with the smallest one.
long_read_fastqs = readdir(genome_dir, join=true)
filter!(path -> occursin(r"\.filtlong\.fq\.gz$", path), long_read_fastqs)
sort!(long_read_fastqs, by=filesize)
fastq = long_read_fastqs[1]

In [None]:
# Derive the reference FASTA path from the read file path by removing ".badread" and
# everything that follows it.
reference_fasta = replace(fastq, r"\.badread.*" => "")

In [None]:
# Pick k by passing the read file to Mycelia.assess_dnamer_saturation; the returned
# value is used below as the k-mer length (it parameterizes the k-mer type).
k = Mycelia.assess_dnamer_saturation([fastq])

In [None]:
# Concrete k-mer type used for the dictionary keys and k-mer iterators below.
# NOTE(review): the trailing `1` is presumably the number of storage words per k-mer,
# which would cap k at 32 for 2-bit DNA — confirm against the Kmers.jl documentation.
kmer_type = Kmers.DNAKmer{k, 1}

In [None]:
"""
    fasta_to_reference_kmer_counts(; kmer_type, fasta)

Count every k-mer of type `kmer_type` in the records read from `fasta`, tallying both
each record's sequence and its reverse complement, and return the totals as a
`Dict{kmer_type, Int}`.
"""
function fasta_to_reference_kmer_counts(; kmer_type, fasta)
    tally = Dict{kmer_type, Int}()
    for record in Mycelia.open_fastx(fasta)
        forward_strand = BioSequences.LongDNA{2}(FASTX.sequence(record))
        reverse_strand = BioSequences.reverse_complement(forward_strand)
        # Both strands feed the same tally, one k-mer at a time.
        for strand in (forward_strand, reverse_strand)
            for (_, kmer) in Kmers.EveryKmer{kmer_type}(strand)
                tally[kmer] = get(tally, kmer, 0) + 1
            end
        end
    end
    return tally
end

# reference_kmers = Set(keys(fasta_to_reference_kmer_counts(kmer_type = kmer_type, fasta = reference_fasta)))
reference_kmer_counts = fasta_to_reference_kmer_counts(;
    kmer_type=kmer_type,
    fasta=reference_fasta,
)

In [None]:
# Materialize every record of the read file so the cells below can iterate over the
# reads more than once.
records = collect(Mycelia.open_fastx(fastq))

In [None]:
# Running mean of the base quality scores across all reads.
# NOTE(review): chain receives a single generator of per-read quality iterators, so this
# presumably relies on OnlineStats.fit! descending into nested iterables — confirm.
fit_mean = OnlineStats.fit!(OnlineStats.Mean(), IterTools.chain(FASTX.quality_scores(record) for record in records))

In [None]:
# Running extrema of the base quality scores across all reads.
# NOTE(review): chain receives a single generator of per-read quality iterators, so this
# presumably relies on OnlineStats.fit! descending into nested iterables — confirm.
fit_extrema = OnlineStats.fit!(OnlineStats.Extrema(), IterTools.chain(FASTX.quality_scores(record) for record in records))

In [None]:
# Running variance of the base quality scores across all reads.
# NOTE(review): chain receives a single generator of per-read quality iterators, so this
# presumably relies on OnlineStats.fit! descending into nested iterables — confirm.
fit_variance = OnlineStats.fit!(OnlineStats.Variance(), IterTools.chain(FASTX.quality_scores(record) for record in records))
# Standard deviation is the square root of the fitted variance.
standard_deviation = sqrt(OnlineStats.value(fit_variance))

In [None]:
# One vector of base quality scores per read, in the same order as `records`.
read_quality_scores = [collect(FASTX.quality_scores(record)) for record in records]

In [None]:
# Scatter a random sample of 10^4 base quality scores drawn from all reads, then overlay
# the overall mean and the floored (mean - one standard deviation).
# Iterators.flatten walks the per-read vectors lazily. Splatting them into
# IterTools.chain passed one argument per read, which builds a huge argument tuple and
# forces a fresh method specialization for every distinct number of reads.
# NOTE(review): the x axis is the position within the sampled vector rather than a read
# index — confirm whether the "read index" label is intended.
p = StatsPlots.scatter(
    IteratorSampling.itsample(Iterators.flatten(read_quality_scores), 10^4),
    title = "base quality scores",
    xlabel = "read index",
    ylabel = "quality score (PHRED)",
    # color = :black,
    alpha = 0.25,
    label = nothing)

StatsPlots.hline!(
    p,
    [OnlineStats.value(fit_mean)],
    labels = "mean = $(round(OnlineStats.value(fit_mean), digits=3))",
    linestyle = :dash
)

StatsPlots.hline!(p, [floor(OnlineStats.value(fit_mean) - standard_deviation)], label = "(mean - 1σ)")

In [None]:
# Build a dictionary mapping every observed k-mer to a length-k vector holding, for each
# position within the k-mer, the sum of the base quality scores over all observations.
all_kmer_quality_support = Dict{kmer_type, Vector{Float64}}()
for record in records
    record_quality_scores = collect(FASTX.quality_scores(record))
    sequence = BioSequences.LongDNA{2}(FASTX.sequence(record))
    # Lazily produce the quality window for each k-mer start position. Views avoid
    # copying a k-length slice for every position of every read.
    quality_windows = (
        view(record_quality_scores, i:(i + k - 1)) for
        i in 1:(length(record_quality_scores) - k + 1)
    )
    for ((_, kmer), kmer_base_qualities) in zip(Kmers.EveryKmer{kmer_type}(sequence), quality_windows)
        # First sighting starts from zeros; every sighting is then added in place, so no
        # new vector is allocated per observation.
        accumulated = get!(() -> zeros(Float64, k), all_kmer_quality_support, kmer)
        accumulated .+= kmer_base_qualities
    end
end
all_kmer_quality_support

In [None]:
# Observed k-mer counts for the read file, as returned by Mycelia.count_kmers.
kmer_counts = Mycelia.count_kmers(kmer_type, fastq)

In [None]:
# Give each distinct k-mer a 1-based index following the iteration order of kmer_counts.
kmer_indices = Dict(
    kmer => position for (position, kmer) in enumerate(keys(kmer_counts))
)

# Distribution of how often each k-mer was observed (log2-scaled y axis).
StatsPlots.histogram(
    collect(values(kmer_counts));
    title="kmer counts",
    xlabel="# of occurances",
    ylabel="# of kmers",
    yscale=:log2,
    legend=false,
    # color = :black
)

In [None]:
# Canonical k-mer counts for the read file, as returned by Mycelia.count_canonical_kmers.
canonical_kmer_counts = Mycelia.count_canonical_kmers(kmer_type, fastq)
# Re-index: 1-based index per canonical k-mer, in the iteration order of the counts.
kmer_indices = Dict(
    kmer => position for (position, kmer) in enumerate(keys(canonical_kmer_counts))
)

# Distribution of how often each canonical k-mer was observed (log2-scaled y axis).
StatsPlots.histogram(
    collect(values(canonical_kmer_counts));
    title="canonical kmer counts",
    xlabel="# of occurances",
    ylabel="# of kmers",
    yscale=:log2,
    legend=false,
    # color = :black
)

In [None]:
# Split the observed k-mer counts by whether the k-mer occurs in the reference: k-mers
# present in the reference are valid, k-mers absent from it are treated as sequencing
# artifacts. The two conditions used to be swapped, which plotted each group under the
# other group's label and disagreed with the classification used in the later cells.
valid_kmer_counts = [count for (kmer, count) in kmer_counts if haskey(reference_kmer_counts, kmer)]
invalid_kmer_counts = [count for (kmer, count) in kmer_counts if !haskey(reference_kmer_counts, kmer)]

StatsPlots.density(
    [valid_kmer_counts, invalid_kmer_counts],
    title = "kmer counts",
    legend = :topright,
    labels = ["valid kmer counts" "sequencing artifacts"],
    xlabel = "# of occurances",
    ylabel = "# of kmers",
)

In [None]:
all_kmer_quality_support

"""
    jitter(x, n)

Return `n` values scattered around `x`: each is `x` plus a uniformly random offset of
magnitude below 1/3 with a randomly chosen sign.
"""
function jitter(x, n)
    return map(1:n) do _
        magnitude = rand() / 3
        direction = ifelse(rand(Bool), 1, -1)
        x + magnitude * direction
    end
end

# Sorted list of every k-mer present in the reference (other cells refer to it).
reference_kmers = sort(collect(keys(reference_kmer_counts)))

# Split the per-kmer quality values into reference ("valid") and non-reference
# ("sequencing artifact") groups.
valid_average_qualities = Float64[]
invalid_average_qualities = Float64[]
for (kmer, quality_values) in all_kmer_quality_support
    # NOTE(review): quality_values is a Vector{Float64}, so the broadcast applies mean to
    # each scalar element and returns the values unchanged — confirm whether a single
    # per-kmer mean was intended.
    averages = Statistics.mean.(quality_values)
    # haskey on the reference dictionary is a hash lookup; `kmer in reference_kmers`
    # scanned the whole sorted vector for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        append!(valid_average_qualities, averages)
    else
        append!(invalid_average_qualities, averages)
    end
end

p = StatsPlots.scatter(
    [jitter(2, length(invalid_average_qualities)), jitter(1, length(valid_average_qualities))],
    [invalid_average_qualities, valid_average_qualities],
    alpha=0.2,
    title = "Average joint-Q value for each Kmer",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Overlay a horizontal bar at each group's mean, centered on that group's x position.
for (x_center, group, bar_color) in (
    (1, valid_average_qualities, :orange),
    (2, invalid_average_qualities, :blue),
)
    group_mean = Statistics.mean(group)
    StatsPlots.plot!(
        p,
        [x_center - 0.25, x_center + 0.25],
        [group_mean, group_mean];
        linewidth=4,
        color=bar_color,
        label="mean = $(round(group_mean, digits=3))",
    )
end
p

In [None]:
# For every observed k-mer, combine its per-position quality support with the support of
# its reverse complement (when that was observed too).
strand_normalized_quality_support = Dict{kmer_type, Vector{Float64}}()
for (kmer, support) in all_kmer_quality_support
    # Work on a copy: storing `support` itself and then adding in place would also modify
    # all_kmer_quality_support, so the reverse complement processed later would pick up
    # an already-combined vector and be counted twice.
    combined = copy(support)
    reverse_complement_kmer = BioSequences.reverse_complement(kmer)
    if haskey(all_kmer_quality_support, reverse_complement_kmer)
        # NOTE(review): positions are added index-for-index; a reverse complement
        # presumably runs in the opposite direction, which does not change sums over the
        # whole vector but would matter for per-position use — confirm.
        combined .+= all_kmer_quality_support[reverse_complement_kmer]
    end
    strand_normalized_quality_support[kmer] = combined
end
strand_normalized_quality_support

"""
    jitter(x, n)

Return `n` values scattered around `x`: each is `x` plus a uniformly random offset of
magnitude below 1/3 with a randomly chosen sign.
"""
function jitter(x, n)
    return map(1:n) do _
        magnitude = rand() / 3
        direction = ifelse(rand(Bool), 1, -1)
        x + magnitude * direction
    end
end

# Sorted list of every k-mer present in the reference (other cells refer to it).
reference_kmers = sort(collect(keys(reference_kmer_counts)))

# Split the strand-normalized quality values into reference ("valid") and non-reference
# ("sequencing artifact") groups.
valid_average_qualities = Float64[]
invalid_average_qualities = Float64[]
for (kmer, quality_values) in strand_normalized_quality_support
    # NOTE(review): quality_values is a Vector{Float64}, so the broadcast applies mean to
    # each scalar element and returns the values unchanged — confirm whether a single
    # per-kmer mean was intended.
    averages = Statistics.mean.(quality_values)
    # haskey on the reference dictionary is a hash lookup; `kmer in reference_kmers`
    # scanned the whole sorted vector for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        append!(valid_average_qualities, averages)
    else
        append!(invalid_average_qualities, averages)
    end
end

p = StatsPlots.scatter(
    [jitter(2, length(invalid_average_qualities)), jitter(1, length(valid_average_qualities))],
    [invalid_average_qualities, valid_average_qualities],
    alpha=0.2,
    title = "Average adjusted joint-Q value for each Kmer",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Overlay a horizontal bar at each group's mean, centered on that group's x position.
for (x_center, group, bar_color) in (
    (1, valid_average_qualities, :orange),
    (2, invalid_average_qualities, :blue),
)
    group_mean = Statistics.mean(group)
    StatsPlots.plot!(
        p,
        [x_center - 0.25, x_center + 0.25],
        [group_mean, group_mean];
        linewidth=4,
        color=bar_color,
        label="mean = $(round(group_mean, digits=3))",
    )
end
p

In [None]:
# One total per k-mer: the sum of its strand-normalized quality vector, grouped by
# whether the k-mer occurs in the reference.
valid_total_qualities = Float64[]
invalid_total_qualities = Float64[]
for (kmer, quality_values) in strand_normalized_quality_support
    total_quality = sum(quality_values)
    # haskey on the reference dictionary is a hash lookup; `kmer in reference_kmers`
    # scanned the whole sorted vector for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        push!(valid_total_qualities, total_quality)
    else
        push!(invalid_total_qualities, total_quality)
    end
end

p = StatsPlots.scatter(
    [jitter(2, length(invalid_total_qualities)), jitter(1, length(valid_total_qualities))],
    [invalid_total_qualities, valid_total_qualities],
    alpha=0.2,
    title = "Total adjusted joint-Q value for each Kmer",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Overlay a horizontal bar at each group's mean, centered on that group's x position.
for (x_center, group, bar_color) in (
    (1, valid_total_qualities, :orange),
    (2, invalid_total_qualities, :blue),
)
    group_mean = Statistics.mean(group)
    StatsPlots.plot!(
        p,
        [x_center - 0.25, x_center + 0.25],
        [group_mean, group_mean];
        linewidth=4,
        color=bar_color,
        label="mean = $(round(group_mean, digits=3))",
    )
end
p

In [None]:
# Here we ask: what is the minimum quality of the BEST observation for each k-mer?
# NOTE(review): quality_values is a Vector{Float64}, so the broadcast applies minimum to
# each scalar element and returns the values unchanged; the value pushed is therefore
# the largest element of the vector — confirm this matches the question above.
valid_max_minimum_qualities = Float64[]
invalid_max_minimum_qualities = Float64[]
for (kmer, quality_values) in all_kmer_quality_support
    minimums = minimum.(quality_values)
    # haskey on the reference dictionary is a hash lookup; `kmer in reference_kmers`
    # scanned the whole sorted vector for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        push!(valid_max_minimum_qualities, maximum(minimums))
    else
        push!(invalid_max_minimum_qualities, maximum(minimums))
    end
end

p = StatsPlots.scatter(
    [jitter(2, length(invalid_max_minimum_qualities)), jitter(1, length(valid_max_minimum_qualities))],
    [invalid_max_minimum_qualities, valid_max_minimum_qualities],
    alpha=0.2,
    title = "minimum Q value for the best observation",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Overlay a horizontal bar at each group's mean, centered on that group's x position.
for (x_center, group, bar_color) in (
    (1, valid_max_minimum_qualities, :orange),
    (2, invalid_max_minimum_qualities, :blue),
)
    group_mean = Statistics.mean(group)
    StatsPlots.plot!(
        p,
        [x_center - 0.25, x_center + 0.25],
        [group_mean, group_mean];
        linewidth=4,
        color=bar_color,
        label="mean = $(round(group_mean, digits=3))",
    )
end
p

In [None]:
# Here we ask: what is the average quality of all of the considered k-mers?
valid_average_qualities = Float64[]
invalid_average_qualities = Float64[]
for (kmer, quality_values) in all_kmer_quality_support
    # NOTE(review): quality_values is a Vector{Float64}, so the broadcast applies mean to
    # each scalar element and returns the values unchanged — confirm whether a single
    # per-kmer mean was intended.
    means = Statistics.mean.(quality_values)
    # haskey on the reference dictionary is a hash lookup; `kmer in reference_kmers`
    # scanned the whole sorted vector for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        append!(valid_average_qualities, means)
    else
        append!(invalid_average_qualities, means)
    end
end

p = StatsPlots.scatter(
    [jitter(2, length(invalid_average_qualities)), jitter(1, length(valid_average_qualities))],
    [invalid_average_qualities, valid_average_qualities],
    alpha=0.2,
    title = "Average Q value for all observations",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Overlay a horizontal bar at each group's mean, centered on that group's x position.
for (x_center, group, bar_color) in (
    (1, valid_average_qualities, :orange),
    (2, invalid_average_qualities, :blue),
)
    group_mean = Statistics.mean(group)
    StatsPlots.plot!(
        p,
        [x_center - 0.25, x_center + 0.25],
        [group_mean, group_mean];
        linewidth=4,
        color=bar_color,
        label="mean = $(round(group_mean, digits=3))",
    )
end
p

In [None]:
# Average quality of the BEST observation for each k-mer.
# NOTE(review): quality_values is a Vector{Float64}, so the broadcast applies mean to
# each scalar element and returns the values unchanged; the value pushed is therefore
# the largest element of the vector — confirm this matches the stated intent.
valid_max_average_qualities = Float64[]
invalid_max_average_qualities = Float64[]
for (kmer, quality_values) in all_kmer_quality_support
    means = Statistics.mean.(quality_values)
    # haskey on the reference dictionary is a hash lookup; `kmer in reference_kmers`
    # scanned the whole sorted vector for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        push!(valid_max_average_qualities, maximum(means))
    else
        push!(invalid_max_average_qualities, maximum(means))
    end
end

p = StatsPlots.scatter(
    [jitter(2, length(invalid_max_average_qualities)), jitter(1, length(valid_max_average_qualities))],
    [invalid_max_average_qualities, valid_max_average_qualities],
    alpha=0.2,
    title = "Average Q value for best observations",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing
)
# Overlay a horizontal bar at each group's mean, centered on that group's x position.
for (x_center, group, bar_color) in (
    (1, valid_max_average_qualities, :orange),
    (2, invalid_max_average_qualities, :blue),
)
    group_mean = Statistics.mean(group)
    StatsPlots.plot!(
        p,
        [x_center - 0.25, x_center + 0.25],
        [group_mean, group_mean];
        linewidth=4,
        color=bar_color,
        label="mean = $(round(group_mean, digits=3))",
    )
end
p

In [None]:
# Here we ask: what is the total quality of all of the considered k-mers?

# Total quality per k-mer: the sum of every value in its quality-support vector.
# NOTE(review): Base does not define `sort` for a Dict, so this presumably relies on a
# method supplied by a loaded package (e.g. OrderedCollections) — confirm.
state_quality_scores = sort(Dict(kmer => sum(sum.(quality_scores)) for (kmer, quality_scores) in all_kmer_quality_support))

valid_total_qualities = Float64[]
invalid_total_qualities = Float64[]
for kmer in keys(all_kmer_quality_support)
    # haskey on the reference dictionary is a hash lookup; `kmer in reference_kmers`
    # scanned the whole sorted vector for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        push!(valid_total_qualities, state_quality_scores[kmer])
    else
        push!(invalid_total_qualities, state_quality_scores[kmer])
    end
end

# The y axis is capped at the largest total among the valid k-mers.
p = StatsPlots.scatter(
    [jitter(2, length(invalid_total_qualities)), jitter(1, length(valid_total_qualities))],
    [invalid_total_qualities, valid_total_qualities],
    alpha=0.2,
    title = "Total Q value sum across all observations",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing,
    ylims = (0, maximum(valid_total_qualities))
)
# Overlay a horizontal bar at each group's mean, centered on that group's x position.
for (x_center, group, bar_color) in (
    (1, valid_total_qualities, :orange),
    (2, invalid_total_qualities, :blue),
)
    group_mean = Statistics.mean(group)
    StatsPlots.plot!(
        p,
        [x_center - 0.25, x_center + 0.25],
        [group_mean, group_mean];
        linewidth=4,
        color=bar_color,
        label="mean = $(round(group_mean, digits=3))",
    )
end
p

In [None]:
# Now find the reverse-complement-aware total quality values

In [None]:
total_kmers = sum(values(kmer_counts))
total_states = length(kmer_counts)
kmers = collect(keys(kmer_counts))

# Reverse-complement-aware totals: each k-mer's total quality plus the total of its
# reverse complement, when the reverse complement was observed.
adjusted_state_quality_scores = Dict{kmer_type, Float64}()
for kmer in kmers
    adjusted_state_quality_scores[kmer] = state_quality_scores[kmer]
    reverse_complement_kmer = BioSequences.reverse_complement(kmer)
    # Look the reverse complement up in the source scores. Checking the dictionary being
    # built only found partners that had already been processed, so the result depended
    # on iteration order and one k-mer of every pair missed its partner's contribution.
    if haskey(state_quality_scores, reverse_complement_kmer)
        adjusted_state_quality_scores[kmer] += state_quality_scores[reverse_complement_kmer]
    end
end
adjusted_state_quality_scores

# Group the adjusted totals by whether the k-mer occurs in the reference.
valid_adjusted_qualities = Float64[]
invalid_adjusted_qualities = Float64[]
for kmer in kmers
    # haskey on the reference dictionary is a hash lookup; `kmer in reference_kmers`
    # scanned the whole sorted vector for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        push!(valid_adjusted_qualities, adjusted_state_quality_scores[kmer])
    else
        push!(invalid_adjusted_qualities, adjusted_state_quality_scores[kmer])
    end
end

# The y axis is capped at the largest adjusted total among the valid k-mers.
p = StatsPlots.scatter(
    [jitter(2, length(invalid_adjusted_qualities)), jitter(1, length(valid_adjusted_qualities))],
    [invalid_adjusted_qualities, valid_adjusted_qualities],
    alpha=0.2,
    title = "Total Q value sum across all observations",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    labels = nothing,
    ylims = (0, maximum(valid_adjusted_qualities))
)
# Overlay a horizontal bar at each group's mean, centered on that group's x position.
for (x_center, group, bar_color) in (
    (1, valid_adjusted_qualities, :orange),
    (2, invalid_adjusted_qualities, :blue),
)
    group_mean = Statistics.mean(group)
    StatsPlots.plot!(
        p,
        [x_center - 0.25, x_center + 0.25],
        [group_mean, group_mean];
        linewidth=4,
        color=bar_color,
        label="mean = $(round(group_mean, digits=3))",
    )
end
p