In [None]:
# If you hit plotting-library issues, try resetting LD_LIBRARY_PATH for Julia.
# It can be set in the kernel configuration under ~/.local/share/jupyter/kernels/
# @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg
pkgs = [
    "Revise",
    "FASTX",
    "ProgressMeter"
    # "StatsBase",
    # "Distributions",
    # "StatsPlots",
    # "Random",
    # "Dates",
    # "DataFrames",
    # "BioSequences",
    # "Conda",
    # "Downloads"
]
# Pkg.add(pkgs)
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
import Mycelia

In [None]:
# Pkg.build("Mycelia")

In [None]:
# The project root is taken to be the parent of the current working directory.
PROJECT_BASEDIR = pwd() |> dirname
data_dir = joinpath(PROJECT_BASEDIR, "data")
# Genome FASTA files live in <data_dir>/genomes; mkpath creates the directory
# (and any missing parents) and returns its path.
genome_dir = joinpath(data_dir, "genomes")
mkpath(genome_dir)

In [None]:
# https://pggb.readthedocs.io/en/latest/rst/installation.html#bioconda
# https://pggb.readthedocs.io/en/latest/rst/installation.html#docker
# Mycelia.add_bioconda_env("pggb")

In [None]:
# https://github.com/pangenome/PanSN-spec
"""
    merge_fasta_files(; fasta_files, fasta_file)

Write the records of every file in `fasta_files`, in order, into the single
FASTA file `fasta_file` (opened in write mode, so existing content is
replaced) and return `fasta_file`.

Records read from an input whose path contains `"normalized"` are rewritten
with a new identifier built by joining the input's basename, the record's
1-based index within that input, and the original identifier with `#`; the new
record is constructed from that identifier and the original sequence. Records
from all other inputs are written unchanged.
"""
function merge_fasta_files(;fasta_files, fasta_file)
    open(fasta_file, "w") do io
        fastx_io = FASTX.FASTA.Writer(io)
        try
            ProgressMeter.@showprogress for f in fasta_files
                # Both values depend only on the file path, so compute them once
                # per file instead of once per record.
                rename_records = occursin(r"normalized", f)
                file_label = basename(f)
                # NOTE(review): assumes Mycelia.open_fastx returns a closeable
                # FASTX reader — confirm against the Mycelia implementation.
                reader = Mycelia.open_fastx(f)
                try
                    for (i, record) in enumerate(reader)
                        if rename_records
                            original_record_identifier = FASTX.identifier(record)
                            updated_record_identifier = join([file_label, i, original_record_identifier], "#")
                            new_record = FASTX.FASTA.Record(updated_record_identifier, FASTX.sequence(record))
                            write(fastx_io, new_record)
                        else
                            write(fastx_io, record)
                        end
                    end
                finally
                    # Release the input handle even if reading or writing fails.
                    close(reader)
                end
            end
        finally
            # Close the writer on every exit path, not only on success.
            close(fastx_io)
        end
    end
    return fasta_file
end

In [None]:
# Reference assemblies: `.fna` files in `genome_dir` whose names do not contain "normalized".
reference_assemblies = filter(readdir(genome_dir)) do filename
    occursin(r"\.fna$", filename) && !occursin("normalized", filename)
end

In [None]:
# Variant assemblies: `.fna` files in `genome_dir` whose names contain "normalized"
# but not "joint".
reference_variant_assemblies = filter(readdir(genome_dir)) do filename
    occursin(r"\.fna$", filename) && occursin("normalized", filename) && !occursin("joint", filename)
end

In [None]:
# Map each reference assembly's identifier (the filename text before the first '.')
# to its filename.
reference_assemblies_by_id = Dict(
    first(split(filename, '.')) => filename for filename in reference_assemblies
)

In [None]:
# Map each variant assembly's identifier (the filename text before the first '.')
# to its filename.
reference_variant_assemblies_by_id = Dict(
    first(split(filename, '.')) => filename for filename in reference_variant_assemblies
)

In [None]:
# Keep only identifiers that have both a reference and a variant assembly.
identifiers = keys(reference_assemblies_by_id) ∩ keys(reference_variant_assemblies_by_id)

In [None]:
# Materialize the identifiers as a vector ordered by identifier length.
identifiers = sort(collect(identifiers); by=length)

In [None]:
# Build a pangenome graph (pggb) from one reference assembly plus its normalized
# variant assembly, then call variants against the reference with vg deconstruct.

# Select the genome to process. `identifier` has to be assigned here so the cell
# runs on a fresh kernel; change the index to process a different genome.
isempty(identifiers) && error("no identifier has both a reference and a normalized variant assembly in $(genome_dir)")
identifier = identifiers[1]

# Reference assembly first, normalized variant assembly second.
genomes = [
    joinpath(genome_dir, reference_assemblies_by_id[identifier]),
    joinpath(genome_dir, reference_variant_assemblies_by_id[identifier])
]

# The merged FASTA is named <reference file>__<variant file>.joint.fna
joint_fasta = join([reference_assemblies_by_id[identifier], reference_variant_assemblies_by_id[identifier]], "__") * ".joint.fna"
joint_fasta = joinpath(genome_dir, joint_fasta)

merge_fasta_files(fasta_files=genomes, fasta_file=joint_fasta)

# Mycelia.add_bioconda_env("samtools")

# Index the merged FASTA with samtools faidx.
run(`$(Mycelia.MAMBA) run --live-stream -n samtools samtools faidx $(joint_fasta)`)

# for record in Mycelia.open_fastx(joint_fasta)
#     display(FASTX.identifier(record))
# end

# Intended value for pggb's -V option; it is not passed below because the
# author could not get that option to work.
variant_string = "$(identifier):#"

outdir = joint_fasta * "__PGGB"

# # cmd = "$(Mycelia.MAMBA) run --live-stream -n pggb pggb -i in.fa.gz -o out1 -t 2 -n 2"
# # -V $(variant_string)
# # can't get this to work
# -V $(variant_string)
cmd = `$(Mycelia.MAMBA) run --live-stream -n pggb pggb -i $(joint_fasta) -o $(outdir) -t 2 -n 2`
run(cmd)

# Anchor the pattern at the end of the name so derived files such as the
# `<gfa>.vcf` written below are never mistaken for the graph on a re-run.
gfa_files = filter(x -> occursin(r"\.gfa$", x), readdir(outdir, join=true))
isempty(gfa_files) && error("pggb produced no .gfa file in $(outdir)")
gfa_file = first(gfa_files)

# The text before the first '.' of the reference filename, i.e. the same value
# used as the key in reference_assemblies_by_id.
path_prefix = first(split(reference_assemblies_by_id[identifier], '.'))

# vg deconstruct writes the VCF to stdout; redirect it to <gfa_file>.vcf
cmd = `$(Mycelia.MAMBA) run --live-stream -n vg vg deconstruct --path-prefix $(path_prefix) --ploidy 1 --path-traversals --all-snarls --threads 2 $(gfa_file)`
run(pipeline(cmd, gfa_file * ".vcf"))