In [None]:
# If you hit plotting-library issues, try resetting LD_LIBRARY_PATH for Julia.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg
ENV["CONDA_JL_USE_MINIFORGE"] = "1"
# Packages this notebook relies on. Each one is imported (not `using`-ed), so
# every call below is written with its module prefix.
pkgs = [
    "Revise",
    "FASTX",
    "VariantCallFormat",
    "StatsBase",
    "Distributions",
    "StatsPlots",
    "Random",
    "Dates",
    "DataFrames",
    "BioSequences",
    "Conda"
]
# Pkg.add(pkgs)
# Build the import expression from a Symbol rather than parsing a string of
# source code, so a package name can only ever be treated as an identifier.
for pkg in Symbol.(pkgs)
    @eval import $pkg
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# Directory layout: the project root is the parent of the current working
# directory; genomes are stored under <root>/data/genomes.
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = joinpath(data_dir, "genomes")
# Create the directory (and any missing parents); mkpath returns the same path.
mkpath(genome_dir)

In [None]:
# Simulation parameters. The seed is pinned to a fixed value; a date-derived
# alternative would be:
# seed = parse(Int, replace(string(Dates.today()), "-" => ""))
seed = 20240121
genome_size = 1_000_000
n_variants = 1000
# genome length divided by the number of variants, rounded to an integer
window_size = round(Int, genome_size / n_variants)
Random.seed!(seed)

# Generate the reference genome record and derive every output path from its
# description.
original_record = Mycelia.random_fasta_record(L=genome_size, seed=seed)
reference_id = FASTX.description(original_record)
fasta_file = joinpath(genome_dir, string(reference_id, ".fna"))
vcf_file = string(fasta_file, ".vcf")
# NOTE(review): the visible pipeline writes its consensus FASTA to a path derived
# from the normalized VCF rather than to this one - confirm which is intended.
modified_fasta_file = string(vcf_file, ".fna")
Mycelia.write_fasta(records = [original_record], outfile = fasta_file)

original_sequence = FASTX.sequence(original_record)

# Variant sizes come from a geometric distribution. The success probability
# 1/ℯ^4 was chosen empirically so that sampled sizes span roughly 1-500.
n = Base.MathConstants.e^4 # ≈ 54.6
# (the misspelled variable name is kept so existing references keep working)
variant_size_disbribution = Distributions.Geometric(1/n)
p = StatsPlots.plot(
    variant_size_disbribution,
    ylabel = "probability density",
    xlabel = "variant size",
    title = "Geometric Distribution (1/ℯ^4)",
    legend=false
)
display(p)
# Draw one size per planned variant (n_variants, rather than a repeated literal)
# and shift by one so that no variant has size zero.
variant_sizes = rand(variant_size_disbribution, n_variants) .+ 1
p = StatsPlots.histogram(
    variant_sizes,
    ylabel = "# of variants",
    xlabel = "variant size",
    title = "Actual samples drawn",
    nbins = length(unique(variant_sizes)),
    legend=false
)
display(p)

# every variant must cover at least one base
@assert all(variant_sizes .>= 1)

# Relative likelihood of each variant type; used as sampling weights below.
variant_type_likelihoods = [
    :substitution => 10^-1,
    :insertion => 10^-2,
    :deletion => 10^-2,
    :inversion => 10^-3,
    # special case insertion/deletions, skipping
    # :translocations => 10^-3,
    # :duplication => 10^-3,
]

# Draw one variant type per planned variant, weighted by the likelihoods above.
# With 1k variants we generally hit a few or more of each type.
variant_types = StatsBase.sample(
    first.(variant_type_likelihoods),
    StatsBase.weights(last.(variant_type_likelihoods)),
    n_variants
)
# Show the per-type counts; a bare expression in the middle of a cell is
# evaluated but never displayed.
display(StatsBase.countmap(variant_types))

# Split the genome into consecutive, non-overlapping windows so that each
# variant gets its own window. The window length is the already computed
# `window_size` (1k windows of 1k basepairs with the default parameters) rather
# than a second hard-coded copy of that number.
step = window_size
window_starts = 1:step:length(original_sequence)
window_ends = step:step:length(original_sequence)
# (start, stop) coordinate pairs, one per window
windows = zip(window_starts, window_ends)
# (type, size) pairs, one per variant
variant_type_sizes = zip(variant_types, variant_sizes)

# Empty table with one typed column per VCF field, in VCF column order.
# Rows are appended later, one per simulated variant.
vcf_table = DataFrames.DataFrame()
for (column_name, column_type) in (
    "#CHROM" => String,
    "POS" => Int,
    "ID" => String,
    "REF" => String,
    "ALT" => String,
    "QUAL" => Int,
    "FILTER" => String,
    "INFO" => String,
    "FORMAT" => String,
    "SAMPLE" => String,
)
    vcf_table[!, column_name] = column_type[]
end

# Simulate one variant per window and append it to `vcf_table` as a VCF record.
# Each (type, size) pair is matched with one (start, stop) window. The variant
# is placed at a random position that keeps it entirely inside its window.
for ((variant_type, variant_size), (start, stop)) in zip(variant_type_sizes, windows)
    # A deletion record is anchored on the base just before the deleted span, so
    # it needs a preceding base and cannot begin at genome position 1.
    first_allowed_start = variant_type == :deletion ? max(start, 2) : start
    last_allowed_start = stop - variant_size
    if last_allowed_start < first_allowed_start
        # The sampled size is at least as large as the window; drawing from an
        # empty range would throw, so skip this variant instead.
        @warn "variant does not fit in its window, skipping" variant_type variant_size start stop
        continue
    end
    selected_start = rand(first_allowed_start:last_allowed_start)
    @assert selected_start <= stop
    original_subsequence = original_sequence[selected_start:selected_start+variant_size-1]
    @assert length(original_subsequence) == variant_size
    if variant_type == :substitution
        ref = string(original_subsequence)
        alt = string(BioSequences.randdnaseq(variant_size))
        # a random replacement can reproduce the reference by chance; redraw until it differs
        while alt == ref
            @info "substitution collision, redrawing..."
            alt = string(BioSequences.randdnaseq(variant_size))
        end
    elseif variant_type == :insertion
        # keep the reference bases and append `variant_size` random bases after them
        ref = original_subsequence
        alt = string(original_subsequence) * string(BioSequences.randdnaseq(variant_size))
    elseif variant_type == :deletion
        # need to take the prefix base to show that the entire subsequence was deleted
        ref = original_sequence[selected_start-1:selected_start+variant_size-1]
        alt = original_sequence[selected_start-1:selected_start-1]
        # the record is reported at the position of the prefix base
        selected_start -= 1
    elseif variant_type == :inversion
        ref = original_subsequence
        alt = reverse(original_subsequence)
        if alt == ref
            # Reversing a single base (or any sequence that reads the same in
            # both directions) changes nothing, so there is no variant to record.
            @warn "inversion leaves the sequence unchanged, skipping" variant_size selected_start
            continue
        end
    else
        throw(ArgumentError("unsupported variant type: $(variant_type)"))
    end
    # @show selected_start
    row = Dict(
        "#CHROM" => reference_id,
        "POS" => selected_start,
        "ID" => ".",
        "REF" => string(ref),
        "ALT" => string(alt),
        "QUAL" => 60,
        "FILTER" => string(variant_type),
        "INFO" => ".",
        "FORMAT" => "GT:GQ",
        "SAMPLE" => "1:60"
    )
    push!(vcf_table, row)
end

# Sanity checks before writing: the columns must form the expected tab-separated
# VCF header line, and no record may have identical REF and ALT alleles.
@assert "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE" == join(names(vcf_table), '\t')
@assert !any(vcf_table[!, "REF"] .== vcf_table[!, "ALT"])

# Write the VCF: meta-information lines, then the tab-separated column header,
# then one tab-separated line per row of `vcf_table`.
open(vcf_file, "w") do io
    column_names = names(vcf_table)
    header_text = """
    ##fileformat=VCFv4.3
    ##fileDate=$(Dates.today())
    ##source=simulated-variants
    ##reference=$(reference_id)
    ##FILTER=<ID=substitution,Description="substitution variant">
    ##FILTER=<ID=insertion,Description="insertion variant">
    ##FILTER=<ID=deletion,Description="deletion variant">
    ##FILTER=<ID=inversion,Description="inversion variant">
    ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
    ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
    """
    print(io, header_text)
    join(io, column_names, '\t')
    println(io)
    for record in DataFrames.eachrow(vcf_table)
        # emit the fields in the same order as the column header
        join(io, (record[name] for name in column_names), '\t')
        println(io)
    end
end

# Compress, index and normalize the simulated VCF against the reference, then
# apply the normalized variants to the reference to produce a modified FASTA.
Mycelia.add_bioconda_envs()

# Clear leftovers from a previous run. `force=true` makes a missing file a no-op.
rm("$(vcf_file).gz"; force=true)
rm("$(vcf_file).gz.tbi"; force=true)
# Swap only the trailing ".vcf" extension. A plain substring replace would also
# rewrite any ".vcf" that appears earlier in the path.
normalized_vcf_file = replace(vcf_file, r"\.vcf$" => ".normalized.vcf")
run(`$(Conda.conda) run --live-stream -n htslib bgzip $(vcf_file)`)
run(`$(Conda.conda) run --live-stream -n tabix tabix -f -p vcf $(vcf_file).gz`)
# bcftools norm writes to stdout, which is redirected into the normalized VCF file
run(pipeline(`$(Conda.conda) run --live-stream -n bcftools bcftools norm -cs --fasta-ref $(fasta_file) $(vcf_file).gz`, normalized_vcf_file))
# the compressed, un-normalized VCF and its index are no longer needed
rm("$(vcf_file).gz")
rm("$(vcf_file).gz.tbi")
# Clear leftovers of the normalized outputs before regenerating them.
rm("$(normalized_vcf_file).gz"; force=true)
rm("$(normalized_vcf_file).gz.tbi"; force=true)
rm("$(normalized_vcf_file).fna"; force=true)
run(`$(Conda.conda) run --live-stream -n htslib bgzip $(normalized_vcf_file)`)
run(`$(Conda.conda) run --live-stream -n tabix tabix -p vcf $(normalized_vcf_file).gz`)
# apply the normalized variants to the reference; output goes to <normalized vcf>.fna
run(`$(Conda.conda) run --live-stream -n bcftools bcftools consensus -f $(fasta_file) $(normalized_vcf_file).gz -o $(normalized_vcf_file).fna`)

# last expression of the cell: show the table of simulated variants
vcf_table