In [None]:
# Troubleshooting: if the plotting libraries fail to load, try resetting the
# LD path for julia. It can be set in the kernel spec under
# ~/.local/share/jupyter/kernels/
# @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg

# Names of the packages imported by this cell. Commented-out entries are kept
# for reference and are not imported.
pkgs = [
    "Revise",
    # "FASTX",
    # "StatsBase",
    # "Distributions",
    # "StatsPlots",
    # "Random",
    # "Dates",
    # "DataFrames",
    # "BioSequences",
    # "Conda",
    # "Downloads"
]
# Pkg.add(pkgs)

# Build and evaluate an `import <name>` statement for every listed package.
foreach(pkg -> eval(Meta.parse("import $pkg")), pkgs)

# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# The project root is the parent of the current working directory.
PROJECT_BASEDIR = pwd() |> dirname
data_dir = joinpath(PROJECT_BASEDIR, "data")
# mkpath creates the directory (and any missing parents) and returns its path,
# so genome_dir holds the path of an existing directory.
genome_dir = joinpath(data_dir, "genomes") |> mkpath

In [None]:
# Provision the conda environments this notebook shells out to.
# - rtg-tools: rtg format / bgzip / index / vcfeval / rocplot
# - bcftools:  bcftools sort (used by the vcfeval cell below)
# An environment is only created when `env list` does not already report it,
# so re-running this cell is cheap and does not rebuild existing environments.
for env_name in ("rtg-tools", "bcftools")
    env_listing = read(`$(Mycelia.MAMBA) env list`, String)
    # Each data row of `env list` ends with the environment's path; header and
    # comment rows never end in a path whose basename equals the env name.
    env_exists = any(split(env_listing, '\n')) do line
        fields = split(line)
        !isempty(fields) && !startswith(line, "#") && basename(last(fields)) == env_name
    end
    if env_exists
        @info "conda environment $(env_name) already exists"
    else
        run(`$(Mycelia.MAMBA) create -c conda-forge -c bioconda -c defaults --strict-channel-priority -n $(env_name) $(env_name) -y`)
    end
end

In [None]:
# Reference FASTA files (*.fna) in genome_dir, excluding any path containing
# "normalized", ordered from smallest to largest file size.
reference_fastas = let genome_files = readdir(genome_dir, join=true)
    unnormalized_fastas = filter(genome_files) do path
        occursin(r"\.fna$", path) && !occursin("normalized", path)
    end
    sort(unnormalized_fastas, by=filesize)
end

In [None]:
# Evaluate every variant-call VCF belonging to one reference genome against
# that genome's normalized baseline VCF using RTG Tools (vcfeval + rocplot).
# Every step is guarded by a check for its output so the cell can be re-run.

# NOTE(review): the original cell carried the marker `# 1` while selecting
# index 2 — confirm which reference genome is intended here.
reference_fasta = reference_fastas[2]

# vcfeval needs the reference in RTG's SDF format; build it once.
reference_sdf = reference_fasta * "_RTG"
if !isdir(reference_sdf)
    run(`$(Mycelia.MAMBA) run --live-stream -n rtg-tools rtg format -o $(reference_sdf) $(reference_fasta)`)
end

genome_files = readdir(genome_dir, join=true)

# Baseline ("truth") variants: the normalized, bgzipped VCF whose path contains
# the reference FASTA path. Fail with a clear message when it is missing.
baseline_candidates = filter(x -> occursin(reference_fasta, x) && occursin(r"\.normalized\.vcf\.gz$", x), genome_files)
isempty(baseline_candidates) && error("no *.normalized.vcf.gz baseline found for $(reference_fasta) in $(genome_dir)")
baseline_variants = first(baseline_candidates)

# need to add variants from Clair3 which are currently in folders
# Candidate call sets: non-empty regular *.vcf files for this reference.
# *.sorted.vcf files are this cell's own intermediates and are excluded so a
# re-run does not treat them as new call sets (-> *.sorted.sorted.vcf).
candidate_calls = filter(genome_files) do x
    occursin(reference_fasta, x) &&
        occursin(r"\.vcf$", x) &&
        !occursin(r"\.sorted\.vcf$", x) &&
        isfile(x) &&
        filesize(x) > 0
end
variant_calls = sort(candidate_calls, by=filesize)

for calls in variant_calls
    # Anchor on the extension so a ".vcf" elsewhere in the path is untouched.
    sorted_calls = replace(calls, r"\.vcf$" => ".sorted.vcf")
    sorted_bgzip_calls = sorted_calls * ".gz"
    outdir = joinpath(baseline_variants * "_RTG", basename(sorted_bgzip_calls))

    # Skip sorting when either the sorted file or its compressed form exists.
    if !isfile(sorted_calls) && !isfile(sorted_bgzip_calls)
        run(`$(Mycelia.MAMBA) run --live-stream -n bcftools bcftools sort $(calls) --output $(sorted_calls)`)
    end
    if !isfile(sorted_bgzip_calls)
        run(`$(Mycelia.MAMBA) run --live-stream -n rtg-tools rtg bgzip $(sorted_calls)`)
    end
    if !isfile(sorted_bgzip_calls * ".tbi")
        run(`$(Mycelia.MAMBA) run --live-stream -n rtg-tools rtg index $(sorted_bgzip_calls)`)
    end

    if !isdir(outdir)
        # Only the parent is created here; outdir itself is left for vcfeval.
        mkpath(dirname(outdir))
        run(`$(Mycelia.MAMBA) run --live-stream -n rtg-tools rtg RTG_MEM=8G vcfeval --all-records --vcf-score-field QUAL --threads=1 --template $(reference_sdf) --baseline $(baseline_variants) --calls $(sorted_bgzip_calls) --squash-ploidy -o $(outdir)`)
    else
        @info "$(outdir) already exists"
    end

    # Plotting is guarded by its own output rather than by outdir, so a plot
    # that failed or was deleted is regenerated on the next run.
    non_snp_roc = joinpath(outdir, "non_snp_roc.tsv.gz")
    snp_roc = joinpath(outdir, "snp_roc.tsv.gz")
    weighted_roc = joinpath(outdir, "weighted_roc.tsv.gz")
    roc_png = joinpath(outdir, "roc.png")
    roc_svg = joinpath(outdir, "roc.svg")
    if isfile(roc_png)
        @info "$(roc_png) already exists"
    elseif all(isfile, (non_snp_roc, snp_roc, weighted_roc))
        run(`$(Mycelia.MAMBA) run --live-stream -n rtg-tools rtg RTG_MEM=8G rocplot --png $(roc_png) --svg $(roc_svg) --curve $(non_snp_roc)=non_snp_roc --curve $(snp_roc)=snp_roc --curve $(weighted_roc)=weighted_roc`)
    else
        @warn "ROC data missing in $(outdir); skipping rocplot"
    end
end

In [None]:
# pip install --user vcftoolz
# https://github.com/CFSAN-Biostatistics/vcftoolz
# https://vcftoolz.readthedocs.io/en/latest/usage.html#compare

In [None]:
# https://mlbasejl.readthedocs.io/en/latest/perfeval.html

In [None]:
# normalize and index all vcf files before comparing

In [None]:
# https://vcftools.sourceforge.net/documentation.html#:~:text=Comparing%20two%20files,%2C%20or%20%2D%2Ddiff%2Dbcf.
# https://vcftools.github.io/perl_module.html#vcf-compare
# ./vcftools --vcf input_data.vcf --diff other_data.vcf --out compare
# * vcf-compare provides additional statistics from BEDTools including the number of duplicate sites and Venn-Diagram Numbers, which show the number of exclusive variants in each respective VCF file
# vcf-compare snp.vcf.gz snp.vcf.gz

In [None]:
# https://davetang.org/muse/2019/09/02/comparing-vcf-files/

In [None]:
# * BEDTools can be used to compare VCF files but only by comparing genomic coordinates; this can provide a quick answer to how many variants overlap and can be used to calculate a Jaccard index, indicating the amount of overall overlap
# intersect reports results with respect to -a
# bedtools intersect -u -a first.vcf.gz -b second.vcf.gz | wc -l
# 70446
 
# # results differ when the second file is used as -a
# bedtools intersect -u -a second.vcf.gz -b first.vcf.gz | wc -l
# 70454
 
# bedtools jaccard -a first.vcf.gz -b second.vcf.gz
# intersection    union   jaccard n_intersections
# 70367   210677  0.334004        70156

In [None]:
# * bcftools isec also provides Venn-Diagram Numbers and additionally creates VCF files based on these intersections
# bcftools isec snp.vcf.gz snp.vcf.gz -p isec

In [None]:
# https://samtools.github.io/bcftools/bcftools.html#sort
# * SnpSift concordance provides intersection counts as well as genotype differences between two VCF files; this is particularly useful for comparing variant calls from two different tools
# SnpSift concordance -v snp.vcf snp.vcf > snp_concordance.txt
# It is assumed that both VCF files are sorted by chromosome and position.
# java -Xmx1g -jar SnpSift.jar concordance -v genotype.vcf sequencing.vcf > concordance.txt
# https://pcingola.github.io/SnpEff/snpsift/introduction/

Performance Metrics: Start by calculating key metrics for each variant calling method, including precision (positive predictive value), recall (sensitivity), accuracy, and F1 score. These metrics will provide a comprehensive view of each method's performance.

Confusion Matrix: For each method, create a confusion matrix that shows the number of true positives, false positives, true negatives, and false negatives. This will give you a clear picture of the type of errors made by each method.

Receiver Operating Characteristic (ROC) Curve: Plot ROC curves for each method. This plot shows the trade-off between true positive rate and false positive rate and is useful for comparing the performance of different methods.

Precision-Recall Curve: Especially when dealing with imbalanced datasets (where variants are rare), precision-recall curves can be more informative than ROC curves.

Heatmaps: Use heatmaps to visualize the performance metrics across different genomes and depths of coverage. This can provide an intuitive understanding of how performance varies with these factors.

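Also consider: Jaccard similarity between call sets, and the area under the curve (AUC) as a single-number summary of each curve.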