In [None]:
# Run identifiers; together they name this analysis' working directory.
DATE = "2021-05-19"
TASK = "kayvirus-pangenome"
# `mkpath` creates the directory (with any missing parents) and returns its path.
# `joinpath` assembles the path with the platform separator rather than a
# hard-coded "/".
DIR = mkpath(joinpath(homedir(), "$(DATE)-$(TASK)"))

In [None]:
import Pkg
# Packages this notebook needs from the package manager.
pkgs = [
    "LightGraphs",
    "MetaGraphs",
    "BioSequences",
    "uCSV",
    "DataFrames",
    "FASTX",
    "HTTP",
    "CodecZlib",
    "DataStructures",
]
# Add every listed package to the active environment.
Pkg.add(pkgs)
# `import` (rather than `using`) each package by name, so all calls below are
# written module-qualified (e.g. `LightGraphs.add_edge!`).
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end

# Mycelia is imported on its own; it is not part of the `pkgs` list added above.
import Mycelia

In [None]:
# Load the NCBI staph phage metadata table that sits in `metadata/` next to the
# current working directory's parent.
ncbi_staph_phage_metadata = let
    csv_path = "$(dirname(pwd()))/metadata/ncbi-staph-phage.csv"
    # The first row is the header; column types are detected from 100 rows.
    parsed_csv = uCSV.read(csv_path; header=1, quotes='"', typedetectrows=100)
    DataFrames.DataFrame(parsed_csv...)
end

In [None]:
# Keep only the rows whose "Nuc_Completeness" column equals "complete".
ncbi_staph_phage_metadata = let table = ncbi_staph_phage_metadata
    keep_row = table[!, "Nuc_Completeness"] .== "complete"
    table[keep_row, :]
end

In [None]:
# Keep only the rows whose "Sequence_Type" column equals "RefSeq".
ncbi_staph_phage_metadata = let table = ncbi_staph_phage_metadata
    keep_row = table[!, "Sequence_Type"] .== "RefSeq"
    table[keep_row, :]
end

# now with 31mers (k = 7 was tried earlier, but the vertex degree was too high)

In [None]:
# k-mer sizes tried so far; only the final uncommented assignment is in effect.
# k = 3 degree too high
# k = 5 degree too high
# k = 7 degree too high
# k = 11
# k = 13
# k = 17
# k = 21
k = 31
# DNA k-mer type of length k; used when counting canonical k-mers, which become
# the graph's vertices.
KMER_TYPE = BioSequences.DNAMer{k}

In [None]:
# (k+1)-mer type: each (k+1)-mer is later split into its first k and last k
# bases, i.e. the two k-mers joined by one edge.
EDGE_MER = BioSequences.DNAMer{k+1}

In [None]:
# Seed the k-mer collection from the first accession in the filtered metadata.
accession = ncbi_staph_phage_metadata[1, "Accession"]
# NOTE(review): `get_sequence` is called unqualified, so it must already be
# defined in this session — confirm where it comes from.
fastx = collect(get_sequence(db = "nuccore", accession = accession))
# Build the Set straight from the dictionary keys; no intermediate vector needed.
kmers = Set(keys(Mycelia.count_canonical_kmers(KMER_TYPE, fastx)))
# Disabled: extend the set with the remaining accessions.
# for accession in ncbi_staph_phage_metadata[2:end, "Accession"]
#     fastx = collect(get_sequence(db = "nuccore", accession = accession))
#     kmers = union!(kmers, collect(keys(Mycelia.count_canonical_kmers(KMER_TYPE, fastx))))
# end
# Disabled: also add each k-mer's reverse complement.
# for kmer in kmers
#     push!(kmers, BioSequences.reverse_complement(kmer))
# end
# Sorted vector of distinct k-mers. Set elements are already unique, so a sort
# is all that is required. A k-mer's index in this vector is its vertex id and
# is looked up later by binary search.
kmers = sort!(collect(kmers))

In [None]:
# Directed metagraph with one vertex per entry of `kmers` and no edges yet.
kmer_graph = MetaGraphs.MetaDiGraph(length(kmers))
# Store k as a graph-level property; the GFA export reads it back to compute
# the overlap length.
MetaGraphs.set_prop!(kmer_graph, :k, k)

In [None]:
# Label vertex i with the i-th k-mer of the sorted `kmers` vector.
for vertex in eachindex(kmers)
    MetaGraphs.set_prop!(kmer_graph, vertex, :kmer, kmers[vertex])
end

In [None]:
# Work with the first record of the downloaded FASTX collection.
record = first(fastx)

In [None]:
# Inputs for the edge loop: the record's sequence, its identifier, and an
# iterator over the sequence's (k+1)-mers.
sequence = FASTX.sequence(record)
record_identifier = FASTX.identifier(record) # attached to every piece of evidence recorded below
edge_iterator = BioSequences.each(EDGE_MER, sequence)

In [None]:
# Thread the record through the graph. Every observed (k+1)-mer contributes
# evidence to its two k-mer vertices, a forward edge (source => destination),
# and the mirrored reverse edge (destination => source) with flipped
# orientations.
for observed_edge_mer in edge_iterator
    # Forward-strand bases of the (k+1)-mer; its first k and last k bases are
    # the source and destination k-mers as they appear in the record.
    edge_bases = BioSequences.LongDNASeq(observed_edge_mer.fw)
    source_kmer = BioSequences.DNAMer(edge_bases[1:end-1])
    destination_kmer = BioSequences.DNAMer(edge_bases[2:end])

    # A vertex id is the position of the canonical k-mer in the sorted `kmers`
    # vector, found by binary search.
    source_vertex = searchsortedfirst(kmers, BioSequences.canonical(source_kmer))
    destination_vertex = searchsortedfirst(kmers, BioSequences.canonical(destination_kmer))

    # Orientation is `true` when the observed k-mer is already canonical.
    source_is_canonical = BioSequences.iscanonical(source_kmer)
    destination_is_canonical = BioSequences.iscanonical(destination_kmer)

    edge_position = observed_edge_mer.position

    # Vertex evidence: the destination k-mer starts one base after the source.
    add_evidence!(
        kmer_graph,
        source_vertex,
        (record = record_identifier, index = edge_position, orientation = source_is_canonical),
    )
    add_evidence!(
        kmer_graph,
        destination_vertex,
        (record = record_identifier, index = edge_position + 1, orientation = destination_is_canonical),
    )

    # Forward edge, as observed in the record.
    forward_edge = LightGraphs.Edge(source_vertex, destination_vertex)
    LightGraphs.add_edge!(kmer_graph, forward_edge)
    MetaGraphs.set_prop!(
        kmer_graph,
        forward_edge,
        :orientations,
        (source_orientation = source_is_canonical, destination_orientation = destination_is_canonical),
    )
    add_evidence!(
        kmer_graph,
        forward_edge,
        (record = record_identifier, index = edge_position, orientation = true),
    )

    # Reverse edge: same pair of vertices traversed the other way, so the
    # endpoints swap and both orientations are negated.
    reverse_edge = LightGraphs.Edge(destination_vertex, source_vertex)
    LightGraphs.add_edge!(kmer_graph, reverse_edge)
    MetaGraphs.set_prop!(
        kmer_graph,
        reverse_edge,
        :orientations,
        (source_orientation = !destination_is_canonical, destination_orientation = !source_is_canonical),
    )
    add_evidence!(
        kmer_graph,
        reverse_edge,
        (record = record_identifier, index = edge_position, orientation = false),
    )
end

In [None]:
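# NOTE: a self-loop such as 1 => 1 can be observed as both + => + and - => -, but only the - => - orientation is recorded:
# the forward and reverse edges are the same graph edge, so the second set_prop! overwrites the first.
# The :orientations property may need to become a set of orientation pairs.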

In [None]:
# Sanity check on graph complexity: how many vertices have each degree.
LightGraphs.degree_histogram(kmer_graph)

In [None]:
# Export the graph as GFA text: a header line, one S line per vertex, and one
# L line per directed edge.
outfile = "test.$(k).gfa"
open(outfile, "w") do io
    println(io, "H\tVN:Z:1.0")

    # Segments: vertex id, k-mer, and the number of evidence entries as RC.
    for vertex in LightGraphs.vertices(kmer_graph)
        segment_kmer = kmer_graph.vprops[vertex][:kmer]
        evidence_count = length(kmer_graph.vprops[vertex][:evidence])
        println(io, "S\t$(vertex)\t$(segment_kmer)\tRC:i:$(evidence_count)")
    end

    # Links: adjacent k-mers overlap by k - 1 bases; `true` orientation is
    # written as '+', `false` as '-'.
    for edge in LightGraphs.edges(kmer_graph)
        overlap_length = kmer_graph.gprops[:k] - 1
        source_sign = kmer_graph.eprops[edge][:orientations].source_orientation ? '+' : '-'
        destination_sign = kmer_graph.eprops[edge][:orientations].destination_orientation ? '+' : '-'
        println(io, "L\t$(edge.src)\t$(source_sign)\t$(edge.dst)\t$(destination_sign)\t$(overlap_length)M")
    end
end

In [None]:
# Display the graph summary as the cell output.
kmer_graph

In [None]:
"""
    find_downstream_vertices(kmer_graph, vertex, orientation)

Return the neighbors of `vertex` that can be reached when leaving `vertex` in
the given `orientation`.

A neighbor qualifies when the `:orientations` property of the edge
`vertex => neighbor` has a `source_orientation` equal to `orientation`.
Self-loops are never reported. The result is a `Vector{Int}` in the order
returned by `LightGraphs.neighbors`.
"""
function find_downstream_vertices(kmer_graph, vertex, orientation)
    downstream = Int[]
    for candidate in LightGraphs.neighbors(kmer_graph, vertex)
        link = LightGraphs.Edge(vertex, candidate)
        leaves_in_orientation =
            kmer_graph.eprops[link][:orientations].source_orientation == orientation
        # Skip self-loops and edges that leave `vertex` in the other orientation.
        (candidate == vertex || !leaves_in_orientation) && continue
        push!(downstream, candidate)
    end
    return downstream
end

In [None]:
"""
    find_unbranched_neighbors(kmer_graph, vertex, orientation)

Return the single downstream neighbor of `vertex` (as a one-element
`Vector{Int}`) when the connection is unbranched in both directions, otherwise
an empty `Vector{Int}`.

The connection is unbranched when `vertex` has exactly one downstream neighbor
in `orientation`, and walking back from that neighbor (in the opposite of the
orientation it is entered with) leads to `vertex` and nothing else.
"""
function find_unbranched_neighbors(kmer_graph, vertex, orientation)
    forward_candidates = find_downstream_vertices(kmer_graph, vertex, orientation)
    # Zero candidates is a dead end; more than one is a branch.
    length(forward_candidates) == 1 || return Int[]

    candidate = first(forward_candidates)
    link = LightGraphs.Edge(vertex, candidate)
    arrival_orientation = kmer_graph.eprops[link][:orientations].destination_orientation

    # Look back from the candidate: flip the orientation it was entered with.
    way_back = find_downstream_vertices(kmer_graph, candidate, !arrival_orientation)
    return way_back == [vertex] ? forward_candidates : Int[]
end

In [None]:
"""
    oriented_unbranching_walk(kmer_graph, vertex, orientation)

Walk away from `vertex` in the given `orientation`, following edges for as long
as `find_unbranched_neighbors` reports exactly one unbranched neighbor.

Returns the traversed edges in walk order as a `Vector` of `LightGraphs.Edge`;
the vector is empty when `vertex` has no unbranched neighbor in that
orientation.

The walk also stops before entering a `(vertex, orientation)` state it has
already been in. Each step depends only on that state, so a repeated state
means the walk has closed a cycle and would otherwise never terminate.
"""
function oriented_unbranching_walk(kmer_graph, vertex, orientation)
    walk = LightGraphs.Edge{Int}[]
    # States already occupied by this walk, including the starting state.
    seen_states = Set{Tuple{Int,Bool}}()
    push!(seen_states, (vertex, orientation))

    viable_neighbors = find_unbranched_neighbors(kmer_graph, vertex, orientation)
    while length(viable_neighbors) == 1
        edge = LightGraphs.Edge(vertex, first(viable_neighbors))
        # The orientation the next vertex is entered with is the orientation
        # the walk continues in.
        next_orientation = kmer_graph.eprops[edge][:orientations].destination_orientation
        next_state = (edge.dst, next_orientation)
        # Closed cycle: taking this edge would repeat an earlier state.
        next_state in seen_states && break
        push!(seen_states, next_state)

        push!(walk, edge)
        vertex = edge.dst
        orientation = next_orientation
        viable_neighbors = find_unbranched_neighbors(kmer_graph, vertex, orientation)
    end
    return walk
end

In [None]:
# Accumulator for extracted unbranched paths; each entry pushed below is one
# walk, stored as a vector of edges.
untigs = []

In [None]:
# One extraction step: take the lowest-numbered vertex that no stored untig
# touches, walk outward from it in both orientations, and store the combined
# walk. Re-run this cell until it prints "done!".
#
# NOTE(review): `visited` is derived from edge endpoints only. A vertex whose
# forward and reverse walks are both empty stores an empty untig, never becomes
# visited, and is therefore picked again on the next run.
visited = unique(sort(vcat([e.src for untig in untigs for e in untig], [e.dst for untig in untigs for e in untig])))
unvisited = setdiff(1:LightGraphs.nv(kmer_graph), visited)
if !isempty(unvisited)
    # Reuse `unvisited` rather than computing the same set difference twice.
    first_unvisited = first(unvisited)
    forward_walk = oriented_unbranching_walk(kmer_graph, first_unvisited, true)
    reverse_walk = oriented_unbranching_walk(kmer_graph, first_unvisited, false)
    # The reverse walk leads away from the start vertex. Reversing the edge
    # order and flipping each edge makes it lead into the start vertex, so it
    # can be placed in front of the forward walk.
    inverted_reverse_walk = [LightGraphs.Edge(e.dst, e.src) for e in reverse(reverse_walk)]
    untig = vcat(inverted_reverse_walk, forward_walk)
    push!(untigs, untig)
else
    println("done!")
end

In [None]:
# println(join(vcat(first(untig).src, [e.dst for e in untig]...), ','))

In [None]:
# Reconstruct the sequence of the first extracted untig.
path = first(untigs)

In [None]:
# The first edge of the path; its source vertex seeds the reconstruction.
edge = first(path)

In [None]:
# Start from the stored k-mer of the first edge's source vertex.
# This rebinds `sequence`, which previously held the record's own sequence.
sequence = BioSequences.LongDNASeq(kmers[edge.src])

In [None]:
# `kmers` holds the canonical form; if the first edge leaves its source in the
# `false` orientation, the seed must be reverse-complemented.
if !kmer_graph.eprops[edge][:orientations].source_orientation
    sequence = BioSequences.reverse_complement(sequence)
end

In [None]:
# Extend the seed one base per edge: orient each destination k-mer as the edge
# dictates, confirm that it overlaps the current sequence end by k - 1 bases,
# then append its final base.
for edge in path
    stored_kmer = BioSequences.LongDNASeq(kmers[edge.dst])
    enters_forward = kmer_graph.eprops[edge][:orientations].destination_orientation
    oriented_kmer = enters_forward ? stored_kmer : BioSequences.reverse_complement(stored_kmer)

    # Invariant: the last k - 1 bases so far equal the first k - 1 bases of the
    # oriented destination k-mer.
    overlap_length = length(oriented_kmer) - 1
    sequence_suffix = sequence[end-overlap_length+1:end]
    destination_prefix = oriented_kmer[1:end-1]
    @assert sequence_suffix == destination_prefix

    push!(sequence, oriented_kmer[end])
end
sequence

In [None]:
# Compare the reconstructed path sequence against the original record sequence.
# NOTE(review): presumably `is_equivalent` also accepts the reverse complement
# as a match — confirm against the Mycelia implementation.
Mycelia.is_equivalent(FASTX.sequence(record), sequence)