In [None]:
# If you hit plotting-library issues, try resetting LD_LIBRARY_PATH for the Julia kernel.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# An unset variable is treated like an empty one (a plain ENV[...] lookup would throw a
# KeyError), and an explicit error is raised instead of relying on @assert.
isempty(get(ENV, "LD_LIBRARY_PATH", "")) ||
    error("LD_LIBRARY_PATH must be empty for this notebook, got: $(ENV["LD_LIBRARY_PATH"])")
# Set up the package environment for this notebook.
import Pkg
# Activate a temporary environment so the installs below do not touch another project.
Pkg.activate(;temp=true)
# Revise is added and imported before Mycelia.
# NOTE(review): presumably so edits to the development checkout are picked up - confirm.
Pkg.add("Revise")
import Revise

# Alternative locations of the Mycelia checkout (kept for reference):
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
# Use the local development checkout of Mycelia under the user's home directory.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registered packages this notebook depends on; each one is installed into the active
# environment and then imported by name.
pkgs = String[
    "DataFrames",
    "uCSV",
    "Kmers",
    "CodecZlib",
    "StatsPlots",
    "GLM",
    "StatsBase",
    "CSV"
]
Pkg.add(pkgs)
# Splice each package name into an `import` expression as a Symbol rather than
# parsing and evaluating a string of code.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end

In [None]:
# Directory layout: the project root is the parent of the current working directory,
# and genomes are stored under <root>/data/genomes.
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = joinpath(data_dir, "genomes")
# mkpath creates any missing parent directories and returns the path it was given.
mkpath(genome_dir)

In [None]:
# Load the RefSeq metadata table once and keep it as a notebook-level global; the
# plotting function below looks accessions up in it via columns such as
# "refseq_category", "#assembly_accession", "ftp_path", "organism_name" and "group".
# The trailing semicolon suppresses the cell output.
refseq_metadata = Mycelia.load_refseq_metadata();

In [None]:
"""
    accession_to_kmer_frequency_plots(; accession, basedir=pwd(),
        outdir=joinpath(basedir, accession), ks=[7, 11, 13, 17])

Plot log2-log2 k-mer frequency spectra for the reference genome registered under
`accession`, one subplot per value of `k`.

The accession is looked up in the global `refseq_metadata` table (rows whose
`refseq_category` is `"reference genome"`), the genome is fetched with
`Mycelia.download_genome_by_ftp` into `outdir`, and for every `k` in `ks` k-mers are
counted with `Mycelia.jellyfish_count` (`canonical=true`) and summarised with
`Mycelia.jellyfish_counts_to_kmer_frequency_histogram`. Each subplot shows the
histogram as a scatter plus a linear model fitted to the log2-transformed data,
labelled with the fit's R².

# Keywords
- `accession`: assembly accession matched against the `#assembly_accession` column.
- `basedir`: parent directory used to derive the default `outdir`.
- `outdir`: directory the genome is downloaded into; created if missing.
- `ks`: k-mer sizes to count and plot; must be non-empty.

# Returns
The plot object holding `length(ks)` vertically stacked subplots.

# Throws
- `ArgumentError` if `ks` is empty or no matching reference genome row is found.
"""
function accession_to_kmer_frequency_plots(;accession, basedir=pwd(), outdir=joinpath(basedir, accession), ks=[7, 11, 13, 17])
    isempty(ks) && throw(ArgumentError("ks must contain at least one k-mer size"))

    # Use the caller-supplied directory; its default already resolves to
    # basedir/accession, so no global path is needed here.
    outdir = mkpath(outdir)

    # isequal (rather than ==) keeps the mask Boolean even if a column holds `missing`.
    is_match = isequal.(refseq_metadata[!, "refseq_category"], "reference genome") .&
               isequal.(refseq_metadata[!, "#assembly_accession"], accession)
    row = findfirst(is_match)
    if isnothing(row)
        throw(ArgumentError("no reference genome with assembly accession \"$(accession)\" in refseq_metadata"))
    end
    fasta = Mycelia.download_genome_by_ftp(ftp=refseq_metadata[row, "ftp_path"], outdir=outdir)
    organism_name = refseq_metadata[row, "organism_name"]
    group_name = refseq_metadata[row, "group"]

    # Count every requested k (not a fixed list) so the tables always line up with `ks`.
    histograms = map(ks) do k
        @time jellyfish_counts_file = Mycelia.jellyfish_count(fastx=fasta, k = k, canonical=true)
        @time count_histogram_file = Mycelia.jellyfish_counts_to_kmer_frequency_histogram(jellyfish_counts_file)
        CSV.read(count_histogram_file, DataFrames.DataFrame, delim='\t')
    end

    p = StatsPlots.plot(
        layout = (length(ks), 1),
        xlabel = "log2(observed frequency)",
        ylabel = "log2(# of kmers)"
    )
    # Track the largest values over all subplots so they can share axis limits.
    x_max = 0.0
    y_max = 0.0
    for (i, (k, kmer_counts_histogram_table)) in enumerate(zip(ks, histograms))
        xs = log2.(kmer_counts_histogram_table[!, "number of observations"])
        ys = log2.(kmer_counts_histogram_table[!, "number of kmers"])
        model = GLM.lm(GLM.@formula(_ys ~ _xs), DataFrames.DataFrame(_xs=xs, _ys=ys))
        r2_value = GLM.r2(model)
        x_max = max(x_max, maximum(xs))
        y_max = max(y_max, maximum(ys))
        StatsPlots.plot!(
            p,
            xs,
            ys,
            seriestype = :scatter,
            title = "$(group_name) $(organism_name) k=$(k)",
            label = missing,
            subplot = i
        )
        # The reported statistic is the coefficient of determination, so label it R².
        StatsPlots.plot!(
            p,
            xs,
            GLM.predict(model),
            label = "R²=$(round(r2_value, digits=3))",
            subplot = i
        )
    end
    # Pad the shared limits by 10% (at least one unit) past the largest value.
    StatsPlots.plot!(
        p,
        xlims = (0, x_max + max(1, ceil(0.1 * x_max))),
        ylims = (0, y_max + max(1, ceil(0.1 * y_max))),
        size = (600, length(ks) * 300),
        leftmargin = 5StatsPlots.Plots.PlotMeasures.mm
    )
    return p
end

In [None]:
# Danio rerio
# 7955
# GCF_000002035.6

# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   2.059938 seconds (617 allocations: 309.625 KiB)
#   0.000201 seconds (116 allocations: 9.531 KiB)
# [ Info: /global/cfs/cdirs/m4269/cjprybol/Mycelia/projects/variant-calling-benchmarking/data/genomes/GCF_000002035.6/GCF_000002035.6_GRCz11_genomic.fna.gz.k7.canonical.jf.count_histogram.tsv already exists
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   2.051873 seconds (617 allocations: 309.641 KiB)
#   1.875662 seconds (181.84 k allocations: 11.496 MiB)
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   1.962617 seconds (613 allocations: 309.562 KiB)
#  21.618736 seconds (116.74 k allocations: 9.253 MiB)
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   1.977223 seconds (734 allocations: 320.039 KiB)
# 479.764688 seconds (100.25 k allocations: 8.412 MiB)
# Danio rerio: build the k-mer frequency plots, downloading under genome_dir.
accession = "GCF_000002035.6"
p = accession_to_kmer_frequency_plots(; accession, basedir = genome_dir)

In [None]:
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   2.707734 seconds (632 allocations: 310.141 KiB)
#   0.462371 seconds (101.25 k allocations: 8.673 MiB)
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   2.008665 seconds (615 allocations: 309.594 KiB)
#   2.110626 seconds (229.84 k allocations: 13.199 MiB)
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   2.025346 seconds (615 allocations: 309.594 KiB)
#  21.925132 seconds (175.54 k allocations: 11.459 MiB)
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   2.024590 seconds (615 allocations: 309.594 KiB)
# 832.700463 seconds (170.55 k allocations: 11.121 MiB)

# Mus musculus
# 10090
# GCF_000001635.27
# Mus musculus: build the k-mer frequency plots, downloading under genome_dir.
accession = "GCF_000001635.27"
p = accession_to_kmer_frequency_plots(; accession, basedir = genome_dir)

In [None]:
# Homo sapiens
# 9606
# GCF_000001405.40

# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   2.123828 seconds (620 allocations: 309.953 KiB)
#   0.425395 seconds (101.41 k allocations: 8.880 MiB)
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   1.656025 seconds (606 allocations: 309.172 KiB)
#   1.583714 seconds (251.21 k allocations: 15.355 MiB)
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   1.745315 seconds (609 allocations: 309.500 KiB)
#  20.587222 seconds (181.83 k allocations: 11.732 MiB)
# [ Info: conda environment kmer-jellyfish already present; set force=true to update/re-install
#   1.816905 seconds (608 allocations: 309.203 KiB)
# 991.531925 seconds (176.60 k allocations: 11.345 MiB)

# Homo sapiens: build the k-mer frequency plots, downloading under genome_dir.
accession = "GCF_000001405.40"
p = accession_to_kmer_frequency_plots(; accession, basedir = genome_dir)