In [None]:
# ENV["LD_LIBRARY_PATH"] = ""

In [None]:
# Work inside the project environment found at the current directory (".").
import Pkg
Pkg.activate(".")
# Pkg.update()

# Pkg.develop(url="https://github.com/cjprybol/Mycelia.git")
# import Mycelia
# Packages used by this notebook; each one is added to the active environment
# and then imported by name below.
pkgs = [
"FASTX",
"Graphs",
"MetaGraphs",
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build an `import <name>` statement from the package name and evaluate it.
    eval(Meta.parse("import $pkg"))
end

In [None]:
"""
    parse_gfa(gfa)

Read a GFA assembly graph and return it as a `MetaGraphs.MetaDiGraph`.

`gfa` is handed directly to `eachline`, so it can be anything `eachline` accepts.
Records are split on tabs and only the leading columns listed below are read; any
further columns on a line are ignored.

- `S` (Segment): adds a vertex carrying the `:identifier` and `:sequence` properties.
  `:identifier` is registered as an indexing property so vertices can be looked up
  by segment name.
- `L` (Link): adds a directed edge from the source segment to the destination segment
  and stores `:source_identifier`, `:source_orientation`, `:destination_identifier`,
  `:destination_orientation` and `:overlap_CIGAR` on it. Both segments are looked up
  by identifier, so their `S` lines must appear before the link.
- `P` (Path): stored in the graph-level `:paths` dictionary under the path name as
  `Dict("segments" => ..., "overlaps" => ...)` holding the raw column text.
- `H` (Header) and `#` (Comment): skipped.
- Blank lines are skipped; every other record type (known or unknown) emits a warning
  and is otherwise ignored.
"""
function parse_gfa(gfa)

    gfa_record_types = Dict(
        '#' => "Comment",
        'H' => "Header",
        'S' => "Segment",
        'L' => "Link",
        'J' => "Jump",
        'C' => "Containment",
        'P' => "Path",
        'W' => "Walk"
    )

    gfa_graph = MetaGraphs.MetaDiGraph()
    MetaGraphs.set_prop!(gfa_graph, :paths, Dict{String, Any}())
    for line in eachline(gfa)
        # A blank line has no record code; reading its first character would throw.
        isempty(strip(line)) && continue

        record_code = first(line)
        # `get` instead of indexing so an unrecognised code reaches the warning below
        # rather than raising a KeyError.
        record_type = get(gfa_record_types, record_code, nothing)
        if record_type === nothing
            @warn "GFA line type $(record_code) not currently handled by the import - please add"
        elseif record_type == "Header" || record_type == "Comment"
            # metadata
            # add me later
        elseif record_type == "Segment"
            # node
            _, record_name, sequence = split(line, '\t')
            Graphs.add_vertex!(gfa_graph)
            node_index = Graphs.nv(gfa_graph)
            MetaGraphs.set_prop!(gfa_graph, node_index, :identifier, record_name)
            # Needed for the identifier -> vertex lookups done when links are parsed.
            MetaGraphs.set_indexing_prop!(gfa_graph, :identifier)
            MetaGraphs.set_prop!(gfa_graph, node_index, :sequence, sequence)
        elseif record_type == "Link"
            _, source_identifier, source_orientation, destination_identifier, destination_orientation, overlap_CIGAR = split(line, '\t')
            source_index = gfa_graph[source_identifier, :identifier]
            destination_index = gfa_graph[destination_identifier, :identifier]
            edge = Graphs.Edge(source_index, destination_index)
            Graphs.add_edge!(gfa_graph, edge)
            MetaGraphs.set_prop!(gfa_graph, edge, :source_identifier, source_identifier)
            MetaGraphs.set_prop!(gfa_graph, edge, :source_orientation, source_orientation)
            MetaGraphs.set_prop!(gfa_graph, edge, :destination_identifier, destination_identifier)
            MetaGraphs.set_prop!(gfa_graph, edge, :destination_orientation, destination_orientation)
            MetaGraphs.set_prop!(gfa_graph, edge, :overlap_CIGAR, overlap_CIGAR)
        elseif record_type == "Path"
            _, path_identifier, segments, overlaps = split(line, '\t')
            gfa_graph.gprops[:paths][path_identifier] = Dict("segments" => segments, "overlaps" => overlaps)
        else
            # Recognised by the table above (Jump, Containment, Walk) but not imported.
            @warn "GFA line type $(record_type) not currently handled by the import - please add"
        end
    end
    return gfa_graph
end

In [None]:
# The data live in the `data` directory next to the current working directory.
data_directory = joinpath(pwd() |> dirname, "data")

In [None]:
# Every entry directly under the data directory, as full paths.
sample_directories = readdir(data_directory, join=true)
# List the contents of each sample directory and keep the sub-directories, dropping
# anything whose path mentions a Jupyter checkpoint folder. Entries of
# `sample_directories` that are not directories are skipped before listing, because
# `readdir` on a plain file throws; `init` keeps the reduction valid when there is
# nothing to list.
trim_galore_directories = filter(
    x -> isdir(x) && !occursin(".ipynb_checkpoints", x),
    reduce(vcat, readdir.(filter(isdir, sample_directories), join=true); init=String[]),
)

In [None]:
# Gather everything found inside the trim_galore directories and keep the entries
# whose path contains "_megahit".
megahit_directories = filter(reduce(vcat, readdir.(trim_galore_directories, join=true))) do entry
    occursin("_megahit", entry)
end

In [None]:
# Peek at the contents of the first megahit directory.
megahit_directories |> first |> readdir

In [None]:
# An output has to be (re)built when it is absent or when an interrupted earlier run
# left an empty file behind.
output_missing_or_empty(path) = !isfile(path) || filesize(path) == 0

# For every megahit assembly directory: convert the contigs to FASTG, reduce that to
# GFA with Bandage, export the GFA segments as FASTA, and render a Bandage image.
# Each step is skipped when its output already exists and is non-empty.
for megahit_directory in megahit_directories
    println(megahit_directory)

    initial_assembled_fasta = "$(megahit_directory)/final.contigs.fa"
    # Swap only the trailing extension. A plain ".fa" replacement would rewrite every
    # ".fa" in the path, e.g. inside a directory name containing ".fastq".
    assembled_fastg = replace(initial_assembled_fasta, r"\.fa$" => ".fastg")

    # read in the assembled fasta file and parse contig identifiers to get final k length
    fasta_reader = FASTX.FASTA.Reader(open(initial_assembled_fasta))
    final_k_lengths = try
        unique([replace(first(split(FASTX.identifier(record), '_')), r"^k" => "") for record in fasta_reader])
    finally
        # Release the file handle even if a record fails to parse.
        close(fasta_reader)
    end
    if length(final_k_lengths) != 1
        error("expected exactly one final k length in $(initial_assembled_fasta), found: $(final_k_lengths)")
    end
    final_k_length = parse(Int, first(final_k_lengths))
    if output_missing_or_empty(assembled_fastg)
        # run(pipeline(`megahit_toolkit contig2fastg $(final_k_length) $(initial_assembled_fasta)`, assembled_fastg))
        run(pipeline(`conda run --live-stream --no-capture-output -n megahit megahit_toolkit contig2fastg $(final_k_length) $(initial_assembled_fasta)`, assembled_fastg))
    end

    # conda create -n bandage -c bioconda bandage
    assembled_gfa = "$(assembled_fastg).gfa"
    if output_missing_or_empty(assembled_gfa)
        # run(`Bandage reduce $(assembled_fastg) $(assembled_gfa)`)
        run(`conda run --live-stream --no-capture-output -n bandage Bandage reduce $(assembled_fastg) $(assembled_gfa)`)
    end

    assembled_fasta = assembled_gfa * ".fna"
    if output_missing_or_empty(assembled_fasta)
        # Parse before opening the output so a parse failure does not leave an empty
        # FASTA file behind.
        # gfa_graph = Mycelia.parse_gfa(assembled_gfa)
        gfa_graph = parse_gfa(assembled_gfa)
        open(assembled_fasta, "w") do io
            fastx_io = FASTX.FASTA.Writer(io)
            try
                for v in Graphs.vertices(gfa_graph)
                    record = FASTX.FASTA.Record(gfa_graph.vprops[v][:identifier], gfa_graph.vprops[v][:sequence])
                    write(fastx_io, record)
                end
            finally
                close(fastx_io)
            end
        end
    end

    # generate a bandage plot of the assembly graph
    bandage_outfile = "$(assembled_gfa).bandage.jpg"
    if output_missing_or_empty(bandage_outfile)
        run(`conda run --live-stream --no-capture-output -n bandage Bandage image $(assembled_gfa) $bandage_outfile`)
    end
end