In [None]:
# If plotting libraries fail to load, try resetting LD_LIBRARY_PATH for the Julia kernel;
# it can be set under ~/.local/share/jupyter/kernels/
# @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg
# ENV["CONDA_JL_USE_MINIFORGE"] = "1"

# Names of the packages this notebook imports.
# Uncomment `Pkg.add(pkgs)` below to install them first.
pkgs = [
    "Revise",
    "FASTX",
    # "VariantCallFormat",
    "StatsBase",
    "Distributions",
    "StatsPlots",
    "Random",
    "Dates",
    "DataFrames",
    "BioSequences",
    "Conda",
    "Downloads",
]
# Pkg.add(pkgs)

# `import` (not `using`) each package, so its names must be module-qualified at call sites.
for pkg_name in pkgs
    @eval import $(Symbol(pkg_name))
end

# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# Run the package build step for Mycelia (requires `Pkg` to be imported in an earlier cell).
Pkg.build("Mycelia")

In [None]:
# Project layout, resolved relative to the notebook's working directory:
#   <parent of pwd>/data/genomes
PROJECT_BASEDIR = dirname(pwd())
data_dir = joinpath(PROJECT_BASEDIR, "data")
genome_dir = joinpath(data_dir, "genomes")
# Create the genomes directory (and any missing parents); `mkpath` returns the path,
# so the cell still displays `genome_dir`.
mkpath(genome_dir)

In [None]:
# Collect the FASTA files in `genome_dir` whose full path ends in `.fna` and also
# contains both ".vcf" and "normalized", ordered from smallest to largest file.
modified_fasta_files = sort(
    filter(readdir(genome_dir, join=true)) do path
        occursin(r"\.fna$", path) &&
            occursin(".vcf", path) &&
            occursin("normalized", path)
    end,
    by=filesize,
)

In [None]:
# Coverage depths to simulate; each value is passed to badread as `--quantity <value>x`.
coverages = Int[10, 100, 1_000]

In [None]:
# !badread --help
# Circular references must be marked manually in the FASTA file (see the link below)
# https://github.com/rrwick/Badread#reference-fasta
# License: GPL
# Runtime note: 50x coverage of E. coli took > 1 hr
# run(`pip3 install git+https://github.com/rrwick/Badread.git`)
# fasta_file = first(fasta_files)
# coverage = first(coverages)

# Submit one SLURM job per (reference FASTA, coverage) pair. Each job runs
# `badread simulate` inside the `badread` environment and gzip-compresses the
# simulated reads into `<fasta_file>.badread.<coverage>x.fq.gz`.
threads = 1   # CPUs requested per job
memory = 8    # memory requested per job, in GB (passed as `mem_gb`)
for fasta_file in modified_fasta_files
    for coverage in coverages
        outfile = fasta_file * ".badread.$(coverage)x.fq.gz"

        # Skip pairs whose output already exists and is non-empty.
        # NOTE(review): gzip writes a header even for empty input, so a failed
        # badread run can leave a small non-empty file that passes this check;
        # likewise, re-running this cell while jobs are still queued will
        # resubmit them. Confirm outputs before relying on this guard.
        if isfile(outfile) && (filesize(outfile) > 0)
            @info "$(outfile) already present"
            continue
        end

        # The command is a shell pipeline (`|`, `>`), so the paths are escaped
        # to keep spaces or shell metacharacters from splitting or altering it.
        # Paths made only of ordinary word characters are left unchanged.
        reference_arg = Base.shell_escape_posixly(fasta_file)
        output_arg = Base.shell_escape_posixly(outfile)
        cmd =
        """
        $(Mycelia.MAMBA) run --live-stream -n badread badread simulate --reference $(reference_arg) --quantity $(coverage)x | gzip > $(output_arg)
        """
        Mycelia.sbatch(
            job_name = basename(outfile),
            mail_user = "cameron.prybol@gmail.com",
            logdir = mkpath("$(homedir())/workspace/slurmlogs"),
            partition = "batch",
            account = "mpsnyder",
            mem_gb = memory,
            cpus_per_task = threads,
            cmd = cmd)
    end
end

In [None]:
# NanoSim: could not get it to work.
# NanoSim-H: too out of date to use.

# nanosim_model_dir = mkpath(joinpath(homedir(), "workspace", "nanosim"))
# nanosim_model_url = "https://github.com/bcgsc/NanoSim/raw/master/pre-trained_models/human_giab_hg002_sub1M_kitv14_dorado.tar.gz"
# nanosim_model_file = last(split(nanosim_model_url, '/'))
# nanosim_model_file_path = joinpath(nanosim_model_dir, nanosim_model_file)
# nanosim_model_directory = replace(nanosim_model_file_path, ".tar.gz" => "")
# if !isdir(nanosim_model_directory)
#     Downloads.download(nanosim_model_url, nanosim_model_file_path)
#     run(`tar --extract --file $(nanosim_model_file_path) --directory $(nanosim_model_dir)`)
# end

# # readdir(nanosim_model_directory, join=true)
# nanosim_model_prefix = nanosim_model_directory * "/hg002_nanosim_sub1M"

# fasta_file = first(fasta_files)

# genome_size = Mycelia.fasta_genome_size(fasta_file)
# read_length = 10_000
# coverage = 10
# read_count = Int(ceil(genome_size / read_length * coverage))

# nanosim_output = fasta_file * "nanosim.$(coverage)x"

# # run(`$(Mycelia.MAMBA) run --live-stream -n nanosim simulator.py genome --model_prefix $(nanosim_model_prefix) --number $(read_count) --ref_g $(fasta_file) --num_threads 1 --output $(nanosim_output) --median_len 10000 --sd_len 1.05 --max_len 100000 --min_len 1000`)

# # for fasta_file in fasta_files
# #     for coverage in [10, 100, 1000]
# fasta_file = first(fasta_files)
# coverage = first(coverages)
# outprefix = "$(fasta_file).art.$(coverage)x."
# forward = outprefix * "1.fq.gz"
# reverse = outprefix * "2.fq.gz"
# if isfile(forward) && isfile(reverse) && (filesize(forward) > 0) && (filesize(reverse) > 0)
#     @info "$(forward) and $(reverse) found, skipping..."
#     continue
# end
# run(`$(Mycelia.MAMBA) run --live-stream -n art art_illumina --noALN --seqSys HS25 --paired --len 150 --mflen 500 --sdev 10 --in $(fasta_file) --fcov $(coverage) --out $(fasta_file).art.$(coverage)x.`)
# run(`gzip $(fasta_file).art.$(coverage)x.1.fq`)
# run(`gzip $(fasta_file).art.$(coverage)x.2.fq`)
# #     end
# # end