In [None]:
# Identify this analysis run by date and task name, and create (if needed) its
# working directory directly under the user's home directory.
DATE = "2021-05-20"
TASK = "phage-k-pangenome"
DIR = mkpath(string(homedir(), "/", DATE, "-", TASK))

In [None]:
import Mycelia

In [None]:
import LightGraphs
import MetaGraphs
import BioSequences
import uCSV
import DataFrames
import FASTX
import HTTP
import CodecZlib
import DataStructures
import Revise

In [None]:
# Read the staph phage metadata CSV that sits in `metadata/` next to the current
# working directory's parent, then keep only the rows that are complete genomes,
# RefSeq records, and assigned to the genus Kayvirus.
ncbi_staph_phage_metadata = DataFrames.DataFrame(uCSV.read("$(dirname(pwd()))/metadata/ncbi-staph-phage.csv", header=1, quotes='"', typedetectrows=100)...)
ncbi_staph_phage_metadata = let table = ncbi_staph_phage_metadata
    # One combined mask instead of three successive filtered copies of the table.
    keep = (table[!, "Nuc_Completeness"] .== "complete") .&
           (table[!, "Sequence_Type"] .== "RefSeq") .&
           (table[!, "Genus"] .== "Kayvirus")
    table[keep, :]
end;
# Preview at most three rows; `first` does not throw when fewer rows remain,
# unlike indexing with the fixed range `1:3`.
show(first(ncbi_staph_phage_metadata, 3), allcols=true)

In [None]:
# Lazy generator: `Mycelia.get_sequence` is called for one accession at a time,
# and only when the generator is iterated.
fastx_iterator = (
    Mycelia.get_sequence(; db="nuccore", accession=id)
    for id in ncbi_staph_phage_metadata[!, "Accession"]
)

In [None]:
# K-mer type handed to the graph builder: DNA k-mers with k = 31.
KMER_TYPE = BioSequences.DNAMer{31}

In [None]:
# Build the k-mer graph from the sequences produced by `fastx_iterator`;
# `@time` reports how long the construction takes.
@time kmer_graph = Mycelia.fastx_to_kmer_graph(KMER_TYPE, fastx_iterator)

In [None]:
"""
    simplify_kmer_graph(kmer_graph)

Collapse `kmer_graph` into a `MetaGraphs.MetaDiGraph` that has one vertex per
oriented untig.

The untigs come from `resolve_untigs` and are oriented by
`determine_oriented_untigs`. Each vertex of the returned graph carries the
properties `:sequence`, `:path` and `:orientations` of its untig, and the graph
itself carries `:k`, copied from `kmer_graph.gprops[:k]`.

An edge `ui -> vi` is added when `kmer_graph` contains an edge that joins a
terminal k-mer of untig `ui` to a terminal k-mer of untig `vi` with matching
orientations. Four junction types are tried, in this order: `+ +`, `+ -`,
`- +`, `- -`. The edge stores an `:orientations` named tuple with the fields
`source_orientation` and `destination_orientation`.

NOTE: the returned graph holds at most one edge per `(ui, vi)` pair, so when
several junction types match the same pair, the `:orientations` of the last
matching type overwrite the earlier ones.

Progress is reported through `@info` and `@time`; individual junction matches
are logged at `@debug` level.
"""
function simplify_kmer_graph(kmer_graph)
    @info "determining untigs"
    @time untigs = resolve_untigs(kmer_graph)
    @info "found $(length(untigs)) untigs"
    @info "orienting untigs"
    @time oriented_untigs = determine_oriented_untigs(kmer_graph, untigs)

    simplified_graph = MetaGraphs.MetaDiGraph(length(oriented_untigs))
    MetaGraphs.set_prop!(simplified_graph, :k, kmer_graph.gprops[:k])
    for (vertex, untig) in enumerate(oriented_untigs)
        MetaGraphs.set_prop!(simplified_graph, vertex, :sequence, untig.sequence)
        MetaGraphs.set_prop!(simplified_graph, vertex, :path, untig.path)
        MetaGraphs.set_prop!(simplified_graph, vertex, :orientations, untig.orientations)
    end

    # For every untig, record its two terminal k-mers as
    # (first k-mer index => orientation) => (last k-mer index => orientation).
    untig_ends = [
        (first(untig.path) => first(untig.orientations)) =>
            (last(untig.path) => last(untig.orientations))
        for untig in oriented_untigs
    ]

    @info "determining connections between untigs"
    for (ui, u) in enumerate(untig_ends)
        u_in, u_out = u
        for (vi, v) in enumerate(untig_ends)
            v_in, v_out = v
            # Leaving `u` forward uses its last k-mer as-is; leaving it in reverse
            # uses its first k-mer flipped. Entering `v` forward uses its first
            # k-mer as-is; entering it in reverse uses its last k-mer flipped.
            junctions = (
                ("+ +", u_out, v_in),
                ("+ -", u_out, _flip_kmer_end(v_out)),
                ("- +", _flip_kmer_end(u_in), v_in),
                ("- -", _flip_kmer_end(u_in), _flip_kmer_end(v_out)),
            )
            for (label, source, destination) in junctions
                if _kmer_edge_matches(kmer_graph, source, destination)
                    @debug "untig junction found" ui vi label
                    _add_untig_edge!(simplified_graph, ui, vi, source, destination)
                end
            end
        end
    end
    return simplified_graph
end

# Return the k-mer end `index => orientation` with its orientation inverted.
_flip_kmer_end((index, orientation)) = index => !orientation

# Report whether `kmer_graph` has an edge between the two k-mer ends whose stored
# orientations equal the requested ones.
function _kmer_edge_matches(kmer_graph, source, destination)
    source_kmer_index, source_orientation = source
    destination_kmer_index, destination_orientation = destination
    edge = LightGraphs.Edge(source_kmer_index, destination_kmer_index)
    LightGraphs.has_edge(kmer_graph, edge) || return false
    observed = kmer_graph.eprops[edge][:orientations]
    return observed.source_orientation == source_orientation &&
           observed.destination_orientation == destination_orientation
end

# Add (or update) the edge `ui -> vi` in `simplified_graph` and store the
# orientations of the junction on it.
function _add_untig_edge!(simplified_graph, ui, vi, source, destination)
    simplified_graph_edge = LightGraphs.Edge(ui, vi)
    LightGraphs.add_edge!(simplified_graph, simplified_graph_edge)
    edge_orientations = (
        source_orientation = last(source),
        destination_orientation = last(destination)
    )
    MetaGraphs.set_prop!(simplified_graph, simplified_graph_edge, :orientations, edge_orientations)
    return simplified_graph
end

In [None]:
"""
    resolve_untigs(kmer_graph)

Partition the vertices of `kmer_graph` into untigs and return them as a vector
of vertex-index paths.

Vertices are scanned in ascending order. Each vertex that is not yet part of an
untig seeds a new one: an unbranching walk is taken from the seed in both
directions, the reverse walk is inverted, and the two are joined into one path.
A seed with no walkable edge in either direction becomes a single-vertex untig.

NOTE(review): `oriented_unbranching_walk` is called unqualified and is not
defined in this part of the file; it is assumed to return a vector of edges
with `src`/`dst` fields. Confirm that it is in scope (e.g. whether it should be
`Mycelia.oriented_unbranching_walk`).
"""
function resolve_untigs(kmer_graph)
    untigs = []
    vertex_count = LightGraphs.nv(kmer_graph)
    # Membership flags replace re-flattening, sorting and de-duplicating every
    # untig on each iteration; the seeds are still chosen in ascending order.
    visited = falses(vertex_count)
    for seed in 1:vertex_count
        visited[seed] && continue
        forward_walk = oriented_unbranching_walk(kmer_graph, seed, true)
        reverse_walk = oriented_unbranching_walk(kmer_graph, seed, false)
        # Flip the reverse walk so that it reads towards the seed.
        inverted_reverse_walk = [LightGraphs.Edge(e.dst, e.src) for e in reverse(reverse_walk)]
        untig_edges = vcat(inverted_reverse_walk, forward_walk)
        if isempty(untig_edges)
            untig = [seed]
        else
            untig = vcat([first(untig_edges).src], [e.dst for e in untig_edges])
        end
        push!(untigs, untig)
        for vertex in untig
            visited[vertex] = true
        end
    end
    return untigs
end

"""
    determine_oriented_untigs(kmer_graph, untigs)

Turn each vertex path in `untigs` into a named tuple
`(sequence, path, orientations)` and return the tuples as a vector.

For every path the sequence starts from the first vertex's `:kmer` property and
is extended by one base per edge, using the orientations stored on the edges of
`kmer_graph` (a `false` orientation means the k-mer is reverse-complemented
before use). A single-vertex path gets the orientation `[true]`.

The result is normalised to the canonical strand: if the assembled sequence is
not canonical, the canonical sequence is stored, `path` is reversed, and
`orientations` is reversed and negated.

An `AssertionError` is raised if a k-mer does not overlap the sequence built so
far by k-1 bases.
"""
function determine_oriented_untigs(kmer_graph, untigs)
    oriented_untigs = []
    for path in untigs
        @debug "orienting untig" path
        sequence = BioSequences.LongDNASeq(kmer_graph.vprops[first(path)][:kmer])
        if length(path) == 1
            orientations = [true]
        elseif length(path) > 1
            initial_edge = LightGraphs.Edge(path[1], path[2])
            initial_orientation = kmer_graph.eprops[initial_edge][:orientations].source_orientation
            orientations = [initial_orientation]
            if !initial_orientation
                sequence = BioSequences.reverse_complement(sequence)
            end

            for (src, dst) in zip(path[1:end-1], path[2:end])
                edge = LightGraphs.Edge(src, dst)
                destination = BioSequences.LongDNASeq(kmer_graph.vprops[edge.dst][:kmer])
                destination_orientation = kmer_graph.eprops[edge][:orientations].destination_orientation
                push!(orientations, destination_orientation)
                if !destination_orientation
                    destination = BioSequences.reverse_complement(destination)
                end
                # Consecutive k-mers must overlap by k-1 bases; only the final
                # base of the destination k-mer extends the sequence.
                sequence_suffix = sequence[end-length(destination)+2:end]
                destination_prefix = destination[1:end-1]
                @assert sequence_suffix == destination_prefix "k-mer at vertex $(dst) does not overlap the untig sequence built so far"
                push!(sequence, destination[end])
            end
        end

        # Evaluate strand canonicity once and reuse it for both fields below.
        is_canonical = BioSequences.iscanonical(sequence)
        oriented_untig = (
            sequence = BioSequences.canonical(sequence),
            path = is_canonical ? path : reverse(path),
            orientations = is_canonical ? orientations : reverse(.!orientations)
        )

        push!(oriented_untigs, oriented_untig)
    end
    return oriented_untigs
end

In [None]:
# Collapse the k-mer graph into a graph with one vertex per oriented untig.
simplified_graph = simplify_kmer_graph(kmer_graph)

In [None]:
# write simplified graph to gfa