In [None]:
# If plotting libraries fail to load, try clearing LD_LIBRARY_PATH for the Julia
# kernel; it can be set in the kernel spec under ~/.local/share/jupyter/kernels/.
# An explicit check is used instead of @assert because assertions may be disabled.
if !isempty(get(ENV, "LD_LIBRARY_PATH", ""))
    error("LD_LIBRARY_PATH must be empty or unset, got: $(ENV["LD_LIBRARY_PATH"])")
end
import Pkg
pkgs = [
    "Revise",
    "DataFrames",
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the import expression from a Symbol rather than parsing source text.
    @eval import $(Symbol(pkg))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# The project root is the parent of the current working directory.
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
# Create <project>/data/genomes (and any missing parents) if it is not there yet.
genome_dir = joinpath(data_dir, "genomes")
mkpath(genome_dir)

In [None]:
# Reference genomes: *.fna files in genome_dir whose path does not contain
# "normalized", ordered by file size (smallest first).
reference_fastas = sort(filter(x -> occursin(r"\.fna$", x) && !occursin("normalized", x), readdir(genome_dir, join=true)), by=x->filesize(x))
# Run `rtg format` for every reference that has no <fasta>_RTG directory yet.
# The runner constant matches the one used by the vcfeval/rocplot cell so the
# whole notebook relies on a single Mycelia runner.
# NOTE(review): confirm Mycelia.CONDA_RUNNER is the intended constant for the
# installed Mycelia version (this cell previously used Mycelia.MAMBA).
for reference_fasta in reference_fastas
    if !isdir(reference_fasta * "_RTG")
        run(`$(Mycelia.CONDA_RUNNER) run --live-stream -n rtg-tools rtg format -o $(reference_fasta)_RTG $(reference_fasta)`)
    end
end
# Identifier = basename up to the first '.'; maps to the full FASTA path.
identifier_to_reference_fasta = Dict(first(split(basename(x), '.')) => x for x in reference_fastas)
# Baseline variant files (*fna.normalized.vcf.gz), ordered by file size.
reference_variants = sort(filter(x -> occursin(r"fna\.normalized\.vcf\.gz$", x), readdir(genome_dir, join=true)), by=x->filesize(x))

In [None]:
# Identifier of each baseline variant file: its basename up to the first '.'.
identifiers = map(path -> first(split(basename(path), '.')), reference_variants)
# Lookup from identifier to the baseline variant file it was derived from.
identifier_to_reference_variants = Dict(
    id => path for (id, path) in zip(identifiers, reference_variants)
)

In [None]:
# Per-caller result directories inside genome_dir. Every *_dirs list is limited to
# real directories because its entries are passed to readdir/joinpath below.
cactus_dirs = filter(x -> occursin(r"cactus"i, x) && isdir(x), readdir(genome_dir, join=true))
pggb_dirs = filter(x -> occursin(r"pggb"i, x) && isdir(x), readdir(genome_dir, join=true))
clair3_dirs = filter(x -> occursin(r"clair3$"i, x) && isdir(x), readdir(genome_dir, join=true))
# bcftools / freebayes VCFs sit directly in genome_dir; *.sorted.vcf files are skipped.
other_vcfs = filter(x -> occursin(r"vcf$", x) && isfile(x) && (occursin(r"bcftools", x) || occursin(r"freebayes", x)) && !occursin(r"\.sorted\.vcf$", x), readdir(genome_dir, join=true))
clair3_vcfs = joinpath.(clair3_dirs, "merge_output.vcf.gz")
# `init=String[]` keeps the reduction well-defined when a caller has no directories.
pggb_vcfs = filter(x -> occursin(r"\.vcf$", x), reduce(vcat, readdir.(pggb_dirs, join=true), init=String[]))
cactus_vcfs = filter(x -> occursin(r"cactus\.vcf\.gz$", x), reduce(vcat, readdir.(cactus_dirs, join=true), init=String[]))
# All candidate call sets, grouped by caller.
vcfs = vcat(cactus_vcfs, pggb_vcfs, clair3_vcfs, other_vcfs)

In [None]:
# Keep only the VCFs whose path does not contain "fixed".
vcfs = filter(!contains("fixed"), vcfs)

In [None]:
# Mycelia.add_bioconda_env("rtg-tools")

In [None]:
# make a table that lists calling method, identifier, file path

In [None]:
# One row per VCF: genome identifier, variant caller label, coverage level, file path.
vcf_table = DataFrames.DataFrame(
    identifier = String[],
    variant_caller = String[],
    coverage = String[],
    vcf = String[]
)
# Caller labels are tried in this order; the first one found in the path wins.
variant_callers = [
    "cactus",
    "PGGB",
    "bcftools",
    "freebayes",
    "clair3"
]
# Coverage tokens looked for in the paths of the remaining callers.
coverage_levels = ["10x", "100x", "1000x"]
for vcf in vcfs
    identifier_index = findfirst(candidate -> occursin(candidate, vcf), identifiers)
    isnothing(identifier_index) && error("no known genome identifier occurs in VCF path: $(vcf)")
    identifier = identifiers[identifier_index]

    # Match labels case-insensitively: the result directories were discovered with
    # case-insensitive patterns, so e.g. a "pggb" directory must still map to "PGGB".
    lowercase_vcf = lowercase(vcf)
    caller_index = findfirst(caller -> occursin(lowercase(caller), lowercase_vcf), variant_callers)
    isnothing(caller_index) && error("no known variant caller label occurs in VCF path: $(vcf)")
    variant_caller = variant_callers[caller_index]

    if variant_caller in ("cactus", "PGGB")
        # cactus and PGGB rows carry no coverage level.
        coverage = "N/A"
    else
        coverage_index = findfirst(level -> occursin(level, vcf), coverage_levels)
        # Fail with a clear message instead of leaving `coverage` unassigned.
        isnothing(coverage_index) && error("no coverage level ($(join(coverage_levels, ", "))) occurs in VCF path: $(vcf)")
        coverage = coverage_levels[coverage_index]
    end
    push!(vcf_table, (;identifier, variant_caller, coverage, vcf))
end
vcf_table

In [None]:
# Display the table of discovered VCFs for inspection.
vcf_table

In [None]:
"""
    fix_vcf(; vcf, outvcf)

Write a copy of the VCF at `vcf` to `outvcf` with one extra trailing column and
return `outvcf`.

- Lines starting with `##` are copied unchanged.
- The remaining `#` line (the column header) gets a tab and `sample` appended.
- Every other line gets a tab and `1` appended.

# Keywords
- `vcf`: path of the VCF to read.
- `outvcf`: path to write; defaults to `vcf` with a trailing `.vcf` replaced by
  `.fixed.vcf`.

# Throws
- `ArgumentError` if `outvcf` resolves to the same path as `vcf`. This happens with
  the default when `vcf` does not end in `.vcf` (for example `.vcf.gz`); opening that
  path for writing would truncate the input before it is read.
"""
function fix_vcf(;vcf, outvcf = replace(vcf, r"\.vcf$" => ".fixed.vcf"))
    if abspath(outvcf) == abspath(vcf)
        throw(ArgumentError("outvcf must differ from vcf (got $(vcf)); writing in place would truncate the input"))
    end
    open(outvcf, "w") do io
        for line in eachline(vcf)
            if startswith(line, "##")
                println(io, line)
            elseif startswith(line, "#")
                println(io, line * "\tsample")
            else
                println(io, line * "\t1")
            end
        end
    end
    return outvcf
end

In [None]:
# Display the table again before running the evaluation loop.
vcf_table

In [None]:
# Evaluate every call set against its genome's baseline variants with RTG vcfeval
# and draw ROC plots. `outdirs` records the output directory chosen for every row,
# whether or not its evaluation succeeded.
outdirs = String[]
for row in DataFrames.eachrow(vcf_table)
    vcf = row["vcf"]
    reference_fasta = identifier_to_reference_fasta[row["identifier"]]
    baseline_variants = identifier_to_reference_variants[row["identifier"]]
    # PGGB VCFs are first rewritten by fix_vcf; every other caller is used as-is.
    if row["variant_caller"] == "PGGB"
        vcf = fix_vcf(vcf=vcf)
    end
    # The output directory is named after the expected normalized VCF file name and
    # lives under <baseline>_RTG.
    expected_processed_vcf = replace(vcf, r"\.vcf(?:\.gz)?$" => ".sorted.normalized.vcf.gz")
    outdir = joinpath(baseline_variants * "_RTG", basename(expected_processed_vcf))
    push!(outdirs, outdir)
    # Remove results from any earlier run so vcfeval starts from a fresh directory.
    if isdir(outdir)
        rm(outdir, recursive=true)
    end
    try
        processed_vcf = Mycelia.normalize_vcf(reference_fasta=reference_fasta, vcf_file=vcf)
        run(`$(Mycelia.CONDA_RUNNER) run --live-stream -n rtg-tools rtg RTG_MEM=8G vcfeval --all-records --vcf-score-field QUAL --threads=1 --template $(reference_fasta)_RTG --baseline $(baseline_variants) --calls $(processed_vcf) --squash-ploidy --output $(outdir)`)
        non_snp_roc = joinpath(outdir, "non_snp_roc.tsv.gz")
        snp_roc = joinpath(outdir, "snp_roc.tsv.gz")
        weighted_roc = joinpath(outdir, "weighted_roc.tsv.gz")
        roc_png = joinpath(outdir, "roc.png")
        roc_svg = joinpath(outdir, "roc.svg")
        run(`$(Mycelia.CONDA_RUNNER) run --live-stream -n rtg-tools rtg RTG_MEM=8G rocplot --png $(roc_png) --svg $(roc_svg) --curve $(non_snp_roc)=non_snp_roc --curve $(snp_roc)=snp_roc --curve $(weighted_roc)=weighted_roc `)
    catch e
        # Let a kernel interrupt stop the whole loop instead of skipping one row.
        e isa InterruptException && rethrow()
        # Best effort: log the failure with its backtrace and continue with the next VCF.
        @error "RTG evaluation failed" vcf outdir exception=(e, catch_backtrace())
    end
end
outdirs

In [None]:
# do PGGB & ???

In [None]:
# count([isdir(dir) for dir in outdirs])

Performance Metrics: Start by calculating key metrics for each variant calling method, including precision (positive predictive value), recall (sensitivity), accuracy, and F1 score. These metrics will provide a comprehensive view of each method's performance.

Confusion Matrix: For each method, create a confusion matrix that shows the number of true positives, false positives, true negatives, and false negatives. This will give you a clear picture of the type of errors made by each method.

Receiver Operating Characteristic (ROC) Curve: Plot ROC curves for each method. This plot shows the trade-off between true positive rate and false positive rate and is useful for comparing the performance of different methods.

Precision-Recall Curve: Especially when dealing with imbalanced datasets (where variants are rare), precision-recall curves can be more informative than ROC curves.

Heatmaps: Use heatmaps to visualize the performance metrics across different genomes and depths of coverage. This can provide an intuitive understanding of how performance varies with these factors.

Jaccard similarity?
AUC (area under the curve)