In [None]:
# If plotting libraries fail to load, try resetting LD_LIBRARY_PATH for the Julia kernel;
# it can be set in the kernel spec under ~/.local/share/jupyter/kernels/.
# `get` with a default lets an unset variable pass the check instead of raising KeyError.
@assert isempty(get(ENV, "LD_LIBRARY_PATH", "")) "LD_LIBRARY_PATH must be empty or unset"
import Pkg
# Packages this notebook imports by name in the loop below (each listed once).
pkgs = [
    "Revise",
    "FASTX",
    "BioSequences",
    "Kmers",
    "Graphs",
    "MetaGraphs",
    "SparseArrays",
    "ProgressMeter",
    "Distributions",
    "HiddenMarkovModels",
    "BioAlignments",
    "StatsBase",
    "Random",
    "StatsPlots",
    "Statistics",
    # "GraphMakie",
    "IterTools",
    "Primes",
    "OnlineStats",
    "IteratorSampling",
    "HypothesisTests",
]
# Pkg.add(pkgs)
for pkg in pkgs
    # Build the import expression from a Symbol rather than parsing a string.
    @eval import $(Symbol(pkg))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# Directory layout, rooted at the parent of the current working directory
# (presumably the notebook is run from a sub-directory of the project — confirm).
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = mkpath(joinpath(data_dir, "genomes"))
# Create the results directory up front: later cells save figures into it, and the
# save call is not relied upon to create missing parent directories.
results_dir = mkpath(joinpath(PROJECT_BASEDIR, "results"))
# mkpath returns the path it created, so the cell still evaluates to working_dir.
working_dir = mkpath(joinpath(data_dir, "test"))

In [None]:
# short_read_sets = unique(map(x -> match(r"^(.+\.\d+x)\.", x).captures[1], filter(x -> occursin(r"\.fna\.art", x) && occursin(r"\.fq\.gz", x) && !occursin("trimming_report", x) && !occursin("_val_", x), sort(readdir(genome_dir, join=true), by=x->filesize(x)))))
# # forward = short_read_set * ".1_val_1.fq.gz"
# # reverse = short_read_set * ".2_val_2.fq.gz"

In [None]:
# Collect the filtlong-filtered long-read FASTQ files found in genome_dir, ordered by
# file size (smallest first), and work with the smallest one.
long_read_fastqs = sort(
    filter(path -> occursin(r"\.filtlong\.fq\.gz$", path), readdir(genome_dir, join=true)),
    by=filesize,
)
# Fail with an explicit message rather than a BoundsError when nothing matched.
isempty(long_read_fastqs) &&
    error("no *.filtlong.fq.gz files found in $(genome_dir)")
fastq = first(long_read_fastqs)

In [None]:
# Derive the reference FASTA path by dropping everything from ".badread" onwards in the
# read file path.
reference_fasta = replace(fastq, r"\.badread.*" => "")
# If the pattern did not match, the "reference" would silently be the read file itself.
reference_fasta == fastq &&
    error("could not derive a reference path: no \".badread\" suffix in $(fastq)")
reference_fasta

In [None]:
# Base font size for plot text: the plotting cells in this notebook pass `fontsize`
# for titles, axis guides and legends, and `fontsize-2` for tick labels.
fontsize = 14

In [None]:
# Choose k for this read set, load the reference k-mer counts and the reads, then
# summarise and plot the per-base quality scores.
k = Mycelia.assess_dnamer_saturation([fastq])

# NOTE(review): the second type parameter is hard-coded to 1 — presumably a storage-size
# parameter that only fits small k; confirm against the Kmers version in use.
kmer_type = Kmers.DNAKmer{k, 1}

# reference_kmers = Set(keys(fasta_to_reference_kmer_counts(kmer_type = kmer_type, fasta = reference_fasta)))
reference_kmer_counts = Mycelia.fasta_to_reference_kmer_counts(kmer_type=kmer_type, fasta=reference_fasta)

records = collect(Mycelia.open_fastx(fastq))

# Decode every read's quality scores once and reuse them for all statistics and the plot.
read_quality_scores = [collect(FASTX.quality_scores(record)) for record in records]
# Lazy stream of every base quality across all reads. Iterators.flatten is used because
# a single-argument chain(...) does not flatten, and splatting one vector per read into
# varargs does not scale with the number of reads.
all_base_qualities = Iterators.flatten(read_quality_scores)

fit_mean = OnlineStats.fit!(OnlineStats.Mean(), all_base_qualities)
fit_mean_value = OnlineStats.value(fit_mean)

fit_extrema = OnlineStats.fit!(OnlineStats.Extrema(), all_base_qualities)

fit_variance = OnlineStats.fit!(OnlineStats.Variance(), all_base_qualities)
standard_deviation = sqrt(OnlineStats.value(fit_variance))

# Scatter a random sample of 10^4 base qualities; the x axis is the position within
# that sample, not a read index, so the label says so.
p = StatsPlots.scatter(
    IteratorSampling.itsample(all_base_qualities, 10^4),
    title = "base quality scores",
    xlabel = "sampled base index",
    ylabel = "quality score (PHRED)",
    # color = :black,
    xtickfontsize = fontsize-2,
    ytickfontsize = fontsize-2,
    xguidefontsize = fontsize,
    yguidefontsize = fontsize,
    titlefontsize = fontsize,
    legendfontsize = fontsize,
    alpha = 0.25,
    label = nothing)

# Horizontal reference line at the mean base quality.
StatsPlots.hline!(
    p,
    [fit_mean_value],
    label = "mean = $(round(fit_mean_value, digits=1))",
    linestyle = :dash
)

# Horizontal reference line one standard deviation below the mean.
one_sigma_quality = fit_mean_value - standard_deviation
StatsPlots.hline!(p, [one_sigma_quality], label = "(mean - 1σ) = $(round(one_sigma_quality, digits=1))")
StatsPlots.savefig(p, joinpath(results_dir, "base_quality_scores.svg"))
p

In [None]:
# Sanity check: one standard deviation below the mean base quality must stay positive.
@assert one_sigma_quality > 0
# Summed quality of a k-mer whose k bases each carry the mean base quality.
k * OnlineStats.value(fit_mean)

In [None]:
# k-mer length in use (evaluated here, but the expression below is the cell's last value).
k
# Summed quality of a k-mer whose k bases each sit one standard deviation below the mean.
k * one_sigma_quality

In [None]:
# Build a dictionary mapping every k-mer observed in the reads to the position-wise sum
# of its base quality scores over all observations (one length-k vector per k-mer).
all_kmer_quality_support = Dict{kmer_type, Vector{Float64}}()
for record in records
    record_quality_scores = collect(FASTX.quality_scores(record))
    sequence = BioSequences.LongDNA{2}(FASTX.sequence(record))
    # Lazily generated views avoid copying a length-k slice for every read position.
    quality_windows = (
        view(record_quality_scores, i:(i + k - 1))
        for i in 1:(length(record_quality_scores) - k + 1)
    )
    kmer_iterator = Kmers.EveryKmer{kmer_type}(sequence)
    for ((i, kmer), kmer_base_qualities) in zip(kmer_iterator, quality_windows)
        # Start each k-mer at zero and add in place, instead of allocating a new
        # vector for every observation.
        support = get!(() -> zeros(Float64, k), all_kmer_quality_support, kmer)
        support .+= kmer_base_qualities
    end
end

kmer_counts = Mycelia.count_kmers(kmer_type, fastq)
kmer_indices = Dict(kmer => i for (i, kmer) in enumerate(keys(kmer_counts)))

# StatsPlots.histogram(
#     collect(values(kmer_counts)),
#     legend=false,
#     title = "kmer counts",
#     xlabel = "# of occurances",
#     ylabel = "# of kmers",
#     # yscale = :log2,
#     ylims = (0, maximum(collect(values(kmer_counts))))
#     # color = :black
# )

canonical_kmer_counts = Mycelia.count_canonical_kmers(kmer_type, fastq)
canonical_kmer_indices = Dict(kmer => i for (i, kmer) in enumerate(keys(canonical_kmer_counts)))

# StatsPlots.histogram(
#     collect(values(canonical_kmer_counts)),
#     legend=false,
#     title = "canonical kmer counts",
#     xlabel = "# of occurances",
#     ylabel = "# of kmers",
#     yscale = :log2
#     # color = :black
# )

# Split the observed k-mer counts by whether the k-mer also occurs in the reference.
# NOTE(review): these come from `kmer_counts`, while the title and file name below say
# "canonical" — confirm whether `canonical_kmer_counts` was intended here.
valid_kmer_counts = [n for (kmer, n) in kmer_counts if haskey(reference_kmer_counts, kmer)]
invalid_kmer_counts = [n for (kmer, n) in kmer_counts if !haskey(reference_kmer_counts, kmer)]

# Density of occurrence counts for k-mers absent from vs. present in the reference.
p = StatsPlots.density(
    [invalid_kmer_counts, valid_kmer_counts],
    title = "canonical kmer counts",
    legend = :topright,
    labels = ["sequencing artifacts" "valid kmer counts"],
    xlabel = "# of occurances",
    ylabel = "# of kmers",
    xtickfontsize = fontsize-2,
    ytickfontsize = fontsize-2,
    xguidefontsize = fontsize,
    yguidefontsize = fontsize,
    titlefontsize = fontsize,
    legendfontsize = fontsize,
)
StatsPlots.savefig(p, joinpath(results_dir, "canonical_kmer_counts.svg"))
p

In [None]:
# Sorted list of the k-mers present in the reference.
reference_kmers = sort(collect(keys(reference_kmer_counts)))

# Split the per-k-mer quality values by whether the k-mer occurs in the reference.
valid_average_qualities = Float64[]
invalid_average_qualities = Float64[]
for (kmer, quality_values) in all_kmer_quality_support
    # NOTE(review): `quality_values` is a Vector{Float64}, so the broadcast mean is applied
    # to single numbers and returns them unchanged; the values plotted are therefore the
    # stored per-position values, not a mean over observations — confirm the intent.
    averages = Statistics.mean.(quality_values)
    # Hash lookup in the reference dictionary; scanning the `reference_kmers` vector for
    # every k-mer would be a linear search each time.
    if haskey(reference_kmer_counts, kmer)
        append!(valid_average_qualities, averages)
    else
        append!(invalid_average_qualities, averages)
    end
end

# Group means, computed once and reused for the marker lines and their legend labels.
valid_average_mean = Statistics.mean(valid_average_qualities)
invalid_average_mean = Statistics.mean(invalid_average_qualities)

# Jittered scatter: valid k-mers around x = 1, sequencing artifacts around x = 2.
p = StatsPlots.scatter(
    [Mycelia.jitter(2, length(invalid_average_qualities)), Mycelia.jitter(1, length(valid_average_qualities))],
    [invalid_average_qualities, valid_average_qualities],
    alpha=0.2,
    title = "Mean across observations - strand-specific",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    ylabel = "kmer quality score",
    labels = nothing,
    xtickfontsize = fontsize-2,
    ytickfontsize = fontsize-2,
    xguidefontsize = fontsize,
    yguidefontsize = fontsize,
    titlefontsize = fontsize,
    legendfontsize = fontsize,
)
# Short horizontal bar marking the mean of the valid group.
StatsPlots.plot!(p,
    [0.75, 1.25],
    [valid_average_mean, valid_average_mean],
    linewidth=4,
    color=:orange,
    label = "mean = $(round(valid_average_mean, digits=3))")
# Short horizontal bar marking the mean of the artifact group.
StatsPlots.plot!(p,
    [1.75, 2.25],
    [invalid_average_mean, invalid_average_mean],
    linewidth=4,
    color=:blue,
    label="mean = $(round(invalid_average_mean, digits=3))")

StatsPlots.savefig(p, joinpath(results_dir, "mean-kmer-qual-scores-stranded.svg"))
p

In [None]:
# Total quality per k-mer (sum of its stored quality vector), split by whether the k-mer
# occurs in the reference.
# NOTE(review): `strand_normalized_quality_support` is not defined in the cells visible
# here — confirm that it is created elsewhere before this cell runs.
valid_total_qualities = Float64[]
invalid_total_qualities = Float64[]
for (kmer, quality_values) in strand_normalized_quality_support
    total_quality = sum(quality_values)
    # Hash lookup in the reference dictionary rather than a linear scan of a sorted
    # vector of reference k-mers for every k-mer.
    if haskey(reference_kmer_counts, kmer)
        push!(valid_total_qualities, total_quality)
    else
        push!(invalid_total_qualities, total_quality)
    end
end

# Group means, computed once and reused for the marker lines and their legend labels.
valid_total_mean = Statistics.mean(valid_total_qualities)
invalid_total_mean = Statistics.mean(invalid_total_qualities)

# Jittered scatter: valid k-mers around x = 1, sequencing artifacts around x = 2.
p = StatsPlots.scatter(
    [Mycelia.jitter(2, length(invalid_total_qualities)), Mycelia.jitter(1, length(valid_total_qualities))],
    [invalid_total_qualities, valid_total_qualities],
    alpha=0.2,
    title = "Total across observations - canonical",
    xticks = ((1, 2), ("valid kmers", "sequencing artifacts")),
    ylabel = "kmer quality score",
    labels = nothing,
    xtickfontsize = fontsize-2,
    ytickfontsize = fontsize-2,
    xguidefontsize = fontsize,
    yguidefontsize = fontsize,
    titlefontsize = fontsize,
    legendfontsize = fontsize,
)
# Short horizontal bar marking the mean of the valid group.
StatsPlots.plot!(p,
    [0.75, 1.25],
    [valid_total_mean, valid_total_mean],
    linewidth=4,
    color=:orange,
    label = "mean = $(round(valid_total_mean, digits=3))")
# Short horizontal bar marking the mean of the artifact group.
StatsPlots.plot!(p,
    [1.75, 2.25],
    [invalid_total_mean, invalid_total_mean],
    linewidth=4,
    color=:blue,
    label="mean = $(round(invalid_total_mean, digits=3))")

StatsPlots.savefig(p, joinpath(results_dir, "total-kmer-qual-scores-shared.svg"))
p