In [None]:
# If the plotting libraries fail to load, try clearing LD_LIBRARY_PATH for the Julia kernel.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg
# Names of the packages this notebook imports below.
pkgs = [
    "Revise",
    "FASTX",
    "VariantCallFormat",
    "StatsBase",
    "Distributions",
    "StatsPlots",
    "Random",
    "Dates",
    "DataFrames",
    "BioSequences"
]
# Uncomment to install the packages listed above.
# Pkg.add(pkgs)
# `import` (not `using`) each package, so only the module name is bound and
# every call in the notebook stays module-qualified (e.g. `StatsBase.sample`).
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end
# Mycelia is imported separately; uncomment one of these to point Pkg at a local checkout.
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# Project layout: <parent of the working directory>/data/genomes
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = joinpath(data_dir, "genomes")
# Create the genome directory (and any missing parents); `mkpath` returns the
# path, so the cell still evaluates to `genome_dir`.
mkpath(genome_dir)

In [None]:
# Simulation parameters. The seed is pinned for reproducibility; a date-derived
# alternative is:
# seed = parse(Int, replace(string(Dates.today()), "-" => ""))
seed = 20240121
genome_size = 1_000_000
n_variants = 1000
# Genome length divided evenly among the variants.
window_size = round(Int, genome_size / n_variants)
Random.seed!(seed)

# Generate the reference genome record (this is where it could be read in instead).
original_record = Mycelia.random_fasta_record(L=genome_size, seed=seed)
reference_id = FASTX.description(original_record)

# Output files are chained by suffix: <id>.fna -> <id>.fna.vcf -> <id>.fna.vcf.fna
fasta_file = joinpath(genome_dir, string(reference_id, ".fna"))
vcf_file = string(fasta_file, ".vcf")
modified_fasta_file = string(vcf_file, ".fna")
Mycelia.write_fasta(records = [original_record], outfile = fasta_file)

original_sequence = FASTX.sequence(original_record)

# Variant sizes follow a geometric distribution; the parameter was chosen
# empirically to find the right distribution size.
n = Base.MathConstants.e^4 # ~ 1-500
# NOTE: the misspelled variable name is kept as-is because other cells may reference it.
variant_size_disbribution = Distributions.Geometric(1/n)
p = StatsPlots.plot(
    variant_size_disbribution,
    ylabel = "probability density",
    xlabel = "variant size",
    title = "Geometric Distribution (1/ℯ^4)",
    legend=false
)
display(p)
# Draw one size per simulated variant (`n_variants`, rather than a hard-coded
# count). Geometric samples start at 0, so shift by one to guarantee that
# every variant spans at least one base.
variant_sizes = rand(variant_size_disbribution, n_variants) .+ 1
p = StatsPlots.histogram(
    variant_sizes,
    ylabel = "# of variants",
    xlabel = "variant size",
    title = "Actual samples drawn",
    nbins = length(unique(variant_sizes)),
    legend=false
)
display(p)

# Internal invariant: the +1 shift above must leave no zero-length variants.
@assert all(variant_sizes .>= 1)

# Relative sampling weight of each variant type (they need not sum to 1;
# they are passed to `StatsBase.weights` below).
variant_type_likelihoods = [
    :substitution => 10^-1,
    :insertion => 10^-2,
    :deletion => 10^-2,
    :inversion => 10^-3,
    # special case insertion/deletions, skipping
    # :translocations => 10^-3,
    # :duplication => 10^-3,
]

# Draw one type per simulated variant (`n_variants`, rather than a hard-coded
# count). If we sample 1k variants, we generally hit a few or more of each.
variant_types = StatsBase.sample(
    first.(variant_type_likelihoods),
    StatsBase.weights(last.(variant_type_likelihoods)),
    n_variants
)
# Show the per-type counts as a sanity check; a bare expression in the middle
# of a cell is evaluated but never displayed.
display(StatsBase.countmap(variant_types))

# Split the genome into consecutive, non-overlapping windows and pair each
# window with one (type, size) draw. The window length comes from `window_size`
# (genome_size / n_variants) instead of a hard-coded 1000, so changing either
# parameter keeps one window per variant. With the current settings this is
# 1k windows of 1k basepairs.
# NOTE: the global name `step` is kept because other cells may reference it.
step = window_size
window_starts = 1:step:length(original_sequence)
window_ends = step:step:length(original_sequence)
# `zip` stops at the shorter iterator, so a trailing partial window is dropped.
windows = zip(window_starts, window_ends)
variant_type_sizes = zip(variant_types, variant_sizes)

# Empty table with the fixed VCF columns; one row is appended per variant.
vcf_table = DataFrames.DataFrame(
    "#CHROM" => String[],
    "POS" => Int[],
    "ID" => String[],
    "REF" => String[],
    "ALT" => String[],
    "QUAL" => Int[],
    "FILTER" => String[],
    "INFO" => String[],
    "FORMAT" => String[]
)

# Place one simulated variant inside each window and append it to `vcf_table`.
# Variant i is paired with window i; `zip` stops at the shorter of the two.
for ((variant_type, variant_size), (start, stop)) in zip(variant_type_sizes, windows)
    # Clamp the size so the variant plus one anchor base always fits inside the
    # window; otherwise `start:stop-span` is empty and `rand` throws.
    span = min(variant_size, stop - start)
    selected_start = rand(start:stop-span)
    original_subsequence = original_sequence[selected_start:selected_start+span-1]
    @assert length(original_subsequence) == span
    # VCF alleles cannot be empty, and "." means "missing", not "nothing here".
    # Indels are therefore anchored on the reference base at POS.
    anchor = string(original_sequence[selected_start:selected_start])
    if variant_type == :substitution
        ref = string(original_subsequence)
        alt = string(BioSequences.randdnaseq(span))
        # A random draw can reproduce the reference (25% of the time for a
        # single base); redraw until the allele actually differs.
        while alt == ref
            alt = string(BioSequences.randdnaseq(span))
        end
    elseif variant_type == :insertion
        # Insert `span` random bases immediately after the anchor base.
        ref = anchor
        alt = anchor * string(BioSequences.randdnaseq(span))
    elseif variant_type == :deletion
        # Delete the `span` bases that follow the anchor base. The end index is
        # in bounds because selected_start <= stop - span.
        ref = string(original_sequence[selected_start:selected_start+span])
        alt = anchor
    elseif variant_type == :inversion
        ref = string(original_subsequence)
        # NOTE(review): this reverses the bases without complementing them; a
        # biological inversion is usually the reverse complement -- confirm intent.
        alt = string(reverse(original_subsequence))
    else
        error("unsupported variant type: $(variant_type)")
    end
    # A record whose ALT equals REF is not a variant (e.g. a 1-base inversion).
    ref == alt && continue
    row = Dict(
        "#CHROM" => reference_id,
        "POS" => selected_start,
        "ID" => ".",
        "REF" => ref,
        "ALT" => alt,
        "QUAL" => 60,
        # FILTER carries the simulated variant type label.
        "FILTER" => string(variant_type),
        # "." is the VCF missing-value marker; an empty field is not valid.
        "INFO" => ".",
        "FORMAT" => "."
    )
    push!(vcf_table, row)
end
vcf_table

In [None]:
# Serialize `vcf_table` to `vcf_file`: meta-information lines, then the
# tab-separated column header, then one tab-separated line per table row.
open(vcf_file, "w") do io
    column_names = names(vcf_table)
    println(io, "##fileformat=VCFv4.3")
    println(io, "##fileDate=$(Dates.today())")
    println(io, "##source=simulated-variants")
    println(io, "##reference=$(reference_id)")
    println(io, join(column_names, '\t'))
    for record in DataFrames.eachrow(vcf_table)
        fields = (record[name] for name in column_names)
        println(io, join(fields, '\t'))
    end
end

In [None]:
# Create one dedicated conda environment per command-line tool, each named
# after the single package it contains. The environments are created in order
# and a failing `run` aborts the remaining ones.
let channel_args = ["-c", "conda-forge", "-c", "bioconda", "-c", "defaults", "--strict-channel-priority"]
    created = map(["htslib", "tabix", "bcftools", "vcftools"]) do tool
        # An interpolated vector is spliced into the command as separate arguments.
        run(`conda create $channel_args -n $tool $tool -y`)
    end
    # Evaluate to the last process, as the final `run` call would.
    last(created)
end

In [None]:
#     !mamba run --live-stream --no-capture-output -n htslib bgzip {short_read_vcf}.filtered.vcf
#     !mamba run --live-stream --no-capture-output -n tabix tabix -p vcf {short_read_vcf}.filtered.vcf.gz
#     !mamba run --live-stream --no-capture-output -n bcftools bcftools norm --fasta-ref {local_outdir}/flye/medaka/consensus.fasta {short_read_vcf}.filtered.vcf.gz > {short_read_vcf}.filtered.normalized.vcf

#     !mamba run --live-stream --no-capture-output -n htslib bgzip {short_read_vcf}.filtered.normalized.vcf
#     !mamba run --live-stream --no-capture-output -n tabix tabix -p vcf {short_read_vcf}.filtered.normalized.vcf.gz
#     !mamba run --live-stream --no-capture-output -n bcftools bcftools consensus -f {local_outdir}/flye/medaka/consen